' .
' ' .
' ' .
'ABLE');
// OpenCalais entity types this script extracts - add or remove names here to change what is processed
$entities = array('Country');

// names found so far, grouped by entity type: one sub-array per type
$found_names = array();
foreach ($entities as $entity_name) {
    // each sub-array opens with the type name itself, so it heads that group in the output
    $found_names[$entity_name][] = $entity_name;
}

// text buffers used while chunking the input
//   $content  - processed (tag-stripped, urlencoded) input waiting to be sent to OpenCalais
//   $response - most recent OpenCalais output
//   $xline    - raw input lines matching $content, handed to apply_oc()
$content = $response = $xline = '';

// length counters used while chunking the input
//   $length_of_content     - size of what is currently queued for OpenCalais
//   $length_of_uline       - size of the line that was just urlencoded
//   $new_length_of_content - size the queue would reach with that line added
//                            (the service will break if more than 3000 chars. are sent)
$length_of_content = $length_of_uline = $new_length_of_content = 0;
/**
* main processing
*
* Reads the input one line at a time and keeps two buffers in step:
* $content (tag-stripped, urlencoded text for OpenCalais) and $xline (the raw lines).
* Whenever adding a line would push $content over the size limit, the buffers are
* flushed: OpenCalais is called, its reply is applied to $xline, and $xline is written
* out via preserve_oc(). Anything still buffered at end of input is flushed the same way.
*/
echo "Reading input...\n";
// largest amount of urlencoded text queued for a single call
// NOTE(review): presumably an OpenCalais request-size limit - confirm the current value
$max_content_length = 3000;
// chunk input for processing; fgets() returns false at end of file or on a read error,
// so a failed read is never treated as a line
while (false !== ($line = fgets($fin))) {
    $sline = strip_tags($line); // remove XML leaving only text
    $uline = urlencode($sline); // convert text ready for http call
    $length_of_uline = strlen($uline); // how much text in this line
    $new_length_of_content = $length_of_content + $length_of_uline; // so how much text in total so far
    if ($max_content_length < $new_length_of_content) { // is too long to add another line
        $response = '';
        // only call the service when something is queued; a single over-long line
        // arriving on an empty buffer must not trigger an empty request
        if ('' != $content) {
            echo "Calling OpenCalais...\n";
            $response = call_oc($licenseID, $content, $paramsXML); // so send what accumulated so far
            echo "Applying response...\n";
            apply_oc($response, $xline); // add semantic mark up to XML
        }
        // write the raw buffer out even when there was no text to annotate, so no source lines are lost
        if ('' != $xline) {
            echo "Preserving response...\n";
            preserve_oc($response);
        }
        $content = $uline; // start new content
        $length_of_content = $length_of_uline; // and set length to match
        $xline = $line; // start new XML buffer
    } else {
        $content .= $uline; // else append new content
        $length_of_content = $new_length_of_content; // and keep track of new length
        $xline .= $line; // append source to XML buffer
    }
}
// process any left over input
$response = '';
if ('' != $content) {
    echo "Calling OpenCalais...\n";
    $response = call_oc($licenseID, $content, $paramsXML);
    echo "Applying response...\n";
    apply_oc($response, $xline);
}
// the tail of the XML is written even if it contained no text for OpenCalais
if ('' != $xline) {
    echo "Preserving response...\n";
    preserve_oc($response);
}
// write every collected name to the first output file, one per line;
// $found_names holds one list per entity type and is filled in by apply_oc()
foreach ($found_names as $names_for_type) {
    foreach ($names_for_type as $name) {
        fwrite($fout1, "{$name}\n");
    }
}
/**
* close down script
*
* Releases the input handle and both output handles; the script stops with a
* message at the first handle that cannot be closed.
*/
if (!fclose($fin)) {
    exit("Unable to close input file {$fn_in}\n");
}
if (!fclose($fout1)) {
    exit("Unable to close output file: {$fn_out1}\n");
}
if (!fclose($fout2)) {
    exit("Unable to close output file: {$fn_out2}\n");
}
echo "Goodbye from run_opencalais\n";
// Functions **********************************************
/**
* writes the current XML buffer to the second output file
*
* Despite its name and parameter, this function does not write the OpenCalais reply:
* it writes the global $xline buffer (the source XML the main loop passes to apply_oc())
* to the global $fout2 file handle.
* @param $response string OpenCalais reply - accepted but not used
* @return null always
*/
function preserve_oc($response) {
    fwrite($GLOBALS['fout2'], $GLOBALS['xline']);
    return null;
}
/**
* calls OpenCalais web service for name entity recognition
*
* Builds a GET request from the three arguments, which are inserted into the URL
* exactly as given (so they must already be URL-safe), and returns the raw reply.
* If the reply contains an <Exception> element its text is echoed so the operator
* can see the error; the reply is still returned.
* @param $licenseID OpenCalais API key
* @param $content urlencoded raw text from XML source
* @param $paramsXML OpenCalais parameters
* @return string OpenCalais reply - either entities or error message;
*         '' when $content is empty or the request fails
*/
function call_oc($licenseID, $content, $paramsXML) {
    $result = ''; // defined up front so every path returns a string
    if ('' == $content) { // sanity check! nothing to send
        return $result;
    }
    $call = "http://api.opencalais.com/enlighten/rest/?licenseID={$licenseID}&content={$content}&paramsXML={$paramsXML}";
    // the second argument of file_get_contents() is use_include_path, not a flags field,
    // so the former FILE_BINARY argument is dropped; the reply is read as raw bytes either way
    $reply = file_get_contents($call);
    if (false === $reply) { // request failed; PHP has already raised a warning
        return $result;
    }
    $result = $reply;
    // oops, so let's see the message
    // NOTE(review): the opening tag was missing from this pattern in the previous revision;
    // <Exception> is restored to mirror the closing tag - confirm against a real error reply
    if (1 === preg_match("/\<Exception\>(.*)\<\/Exception\>/mu", $result, $matches)) {
        echo $matches[1];
    }
    return $result;
}
/**
* applies relevant OpenCalais XML response entities
* extracts entities specified in $entities array
* and applies them to source XML
* and adds to array for printing later
*
* Reads the global $entities list and updates the global $found_names array.
* Replies that are empty, not a string, or not parseable as XML are ignored.
* Names broken over two lines in the source are not handled specially.
* @param $response OpenCalais reply
* @param $xline source text passed as a reference so this function adds mark up to original XML and not to a copy of it
*/
function apply_oc($response, &$xline) {
    global $entities, $found_names;
    // ensure OpenCalais returned something to process
    if (!is_string($response) || '' === $response) {
        return;
    }
    $dom = new DOMDocument;
    // collect parse errors internally so a non-XML reply (e.g. a plain error message)
    // is skipped quietly; the previous libxml setting is restored afterwards
    $previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadXML($response);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        return;
    }
    foreach ($entities as $e) {
        // make sure the list for this entity type exists, headed by the type name
        if (!isset($found_names[$e])) {
            $found_names[$e] = array($e);
        }
        // look in OpenCalais output for entries in our entity list
        $names = $dom->getElementsByTagName($e);
        foreach ($names as $n) {
            $tmp = $n->textContent;
            if ('' === $tmp) { // nothing to mark up or record
                continue;
            }
            // NOTE(review): both wrapper strings are empty here, so this replacement leaves
            // $xline unchanged - presumably the opening/closing mark up was lost; confirm and restore
            $xline = str_replace($tmp, ''.$tmp.'', $xline);
            // ensure no duplicates otherwise save in found names array;
            // strict comparison so numeric-looking names ('1e3' vs '1000') stay distinct
            if (!in_array($tmp, $found_names[$e], true)) {
                $found_names[$e][] = $tmp;
            }
        }
    }
}
?>