' .
' ' .
' ' .
'ABLE');
// OpenCalais entity types this script extracts - adjust the list if other types are needed
$entities = array('City', 'Continent', 'Country', 'ProvinceOrState', 'Region');
// working state shared by the rest of the script
// $a collects the entities found: one sub-array per entity type, keyed by the type name,
// whose first element is the type name itself
$a = array();
foreach ($entities as $e) {
    $a[$e] = array($e);
}
$content = ''; // urlencoded text gathered so far for the next OpenCalais call
$length_of_content = 0; // length of $content, tracked alongside it
/**
* main processing
*
* Reads the input line by line, strips the markup, urlencodes the remaining text and
* sends it to OpenCalais in chunks. A chunk is sent as soon as appending the next line
* would push it past $max_content_length encoded characters; a single line that is longer
* than the limit on its own is still sent as one chunk.
*/
echo "Reading input...\n";
$max_content_length = 3000; // chunk size (in urlencoded characters) used by this script
// chunk input for processing
// loop on the result of fgets() rather than on feof(): fgets() returns false at end of
// file or on a read error, so a failed read is never fed into the chunk
while (false !== ($line = fgets($fin))) {
    $sline = strip_tags($line); // remove XML leaving only text
    $uline = urlencode($sline); // convert text ready for http call
    $length_of_uline = strlen($uline); // how much text in this line
    $new_length_of_content = $length_of_content + $length_of_uline; // total if this line were appended
    if ($max_content_length < $new_length_of_content) {
        // appending this line would overflow the chunk, so send what has been gathered
        if ('' !== $content) { // nothing gathered yet when the very first line is already too long
            echo "Calling OpenCalais...\n";
            $response = call_oc($licenseID, $content, $paramsXML);
            echo "Processing response...\n";
            process_oc($response);
        }
        $content = $uline; // start new content with the current line
        $length_of_content = $length_of_uline; // and set length to match
    } else {
        $content .= $uline; // else append new content
        $length_of_content = $new_length_of_content; // and keep track of new length
    }
}
// process any left over input
if ('' !== $content) {
    echo "Calling OpenCalais...\n";
    $response = call_oc($licenseID, $content, $paramsXML);
    echo "Processing response...\n";
    process_oc($response);
}
// write every entry collected in $a to the output file, one entry per line;
// $a is two levels deep (entity type => list of entries), hence the nested loops
foreach ($a as $entries) {
    foreach ($entries as $entry) {
        fwrite($fout, "{$entry}\n");
    }
}
/**
* close down script
* releases both file handles; the script exits with a message if either close fails
*/
if (!fclose($fin)) {
    exit("Unable to close input file {$fn_in}\n");
}
if (!fclose($fout)) {
    exit("Unable to close output file: {$fn_out}\n");
}
echo "Goodbye from run_opencalais\n";
// Functions **********************************************
/**
* calls OpenCalais web service for name entity recognition
*
* Builds a GET request from the three arguments and fetches it. If the reply contains an
* <Exception>...</Exception> element, the text inside it is echoed so the operator can see
* what went wrong; the reply itself is still returned unchanged.
*
* @param $licenseID OpenCalais API key
* @param $content urlencoded raw text from XML source
* @param $paramsXML OpenCalais parameters; inserted into the URL as-is, so presumably
*                   already urlencoded by the caller - TODO confirm
* @return string OpenCalais reply - either entities or error message;
*                '' when $content is empty or the request itself failed
*/
function call_oc($licenseID, $content, $paramsXML) {
    if ('' == $content) { // sanity check: nothing to send, so there is no reply either
        return '';
    }
    $call = "http://api.opencalais.com/enlighten/rest/?licenseID={$licenseID}&content={$content}&paramsXML={$paramsXML}";
    // file_get_contents() is binary-safe, so no extra flag is needed to cope with 'funny' characters
    $response = file_get_contents($call);
    if (false === $response) { // request failed; PHP has already raised a warning with the reason
        return '';
    }
    // oops, OpenCalais reported a problem, so let's see the message
    // (the /s modifier lets the message span several lines)
    if (preg_match('/<Exception>(.*?)<\/Exception>/s', $response, $matches)) {
        echo $matches[1];
    }
    return $response;
}
/**
* processes OpenCalais XML to extract desired entities and format them for printing
* only processes the entities specified in the global $entities array; every new value is
* appended to the global $a array, in the sub-array keyed by its entity type
* (each $a[<type>] sub-array must already exist when this function is called)
* @param string $response XML text returned by OpenCalais; an empty or non-string value,
*                         or text that does not parse as XML, is ignored
*/
function process_oc($response) {
    global $a, $entities;
    // ensure OpenCalais returned something to process
    if (!is_string($response) || '' === $response) {
        return;
    }
    $dom = new DOMDocument();
    if (!$dom->loadXML($response)) { // not well-formed XML: nothing can be extracted
        return;
    }
    foreach ($entities as $e) {
        // look in OpenCalais output for elements named after entries in our entity list
        foreach ($dom->getElementsByTagName($e) as $n) {
            $tmp = $n->textContent;
            // values whose text spans more than one line get a 'TWO ' marker
            // NOTE(review): presumably flags elements that hold two values - confirm
            if (strpos($tmp, "\n") !== false) {
                $tmp = 'TWO '.$tmp;
            }
            // ensure no duplicates; strict comparison so that distinct numeric-looking
            // strings (e.g. '1e3' and '1000') are not mistaken for the same value
            if (!in_array($tmp, $a[$e], true)) {
                $a[$e][] = $tmp;
            }
        }
    }
}
?>