' . ' ' . ' ' . 'ABLE');
// NOTE(review): the line above is the tail of a statement that starts before this excerpt; kept verbatim

// list of OpenCalais entities we want to process - edit this array if different entities are required
$entities = array('Country');

// miscellaneous processing variables
$found_names = array(); // list of found entities
foreach ($entities as $e) { // multi-dimensional array, so each entity type gets its own sub-array
    $found_names[$e][] = $e; // first item of each sub-array is the name of the entity
}
$content = '';              // urlencoded text accumulated for the next OpenCalais call
$length_of_content = 0;     // length of $content
$length_of_uline = 0;       // length of the line that has just been urlencoded
$new_length_of_content = 0; // length $content would have with that line appended; a call is made before it exceeds 3000
$response = '';             // most recent OpenCalais reply
$xline = '';                // raw source lines matching $content, marked up by apply_oc()

/**
 * main processing
 */
echo "Reading input...\n";

// chunk input for processing
while (!feof($fin)) {
    $line = fgets($fin); // read next line
    if (false === $line) {
        break; // nothing more could be read; avoids spinning forever on a read error
    }
    $sline = strip_tags($line);        // remove XML leaving only text
    $uline = urlencode($sline);        // convert text ready for http call
    $length_of_uline = strlen($uline); // how much text in this line
    $new_length_of_content = $length_of_content + $length_of_uline; // how much text in total so far

    if (3000 < $new_length_of_content) { // too long to add another line
        echo "Calling OpenCalais...\n";
        $response = call_oc($licenseID, $content, $paramsXML); // so send what has accumulated so far
        echo "Applying response...\n";
        apply_oc($response, $xline); // add semantic mark up to XML
        echo "Preserving response...\n";
        preserve_oc($response);      // write the marked up XML buffer to file
        $content = $uline;                     // start new content
        $length_of_content = $length_of_uline; // and set length to match
        $xline = $line;                        // start new XML buffer
    } else {
        $content .= $uline;                         // else append new content
        $length_of_content = $new_length_of_content; // and keep track of new length
        $xline .= $line;                            // append source to XML buffer
    }
}

// process any left over input; also flush when only markup (no text) is left in the
// XML buffer, otherwise those source lines would never reach the output file
if ('' != $content || '' != $xline) {
    echo "Calling OpenCalais...\n";
    $response = call_oc($licenseID, $content, $paramsXML);
    apply_oc($response, $xline);
    echo "Preserving response...\n";
    preserve_oc($response);
}

// print out multi-dimensional array filled in by apply_oc
foreach ($found_names as $entity_type => $entity_value) {
    foreach ($entity_value as $v) {
        fwrite($fout1, $v."\n");
    }
}

/**
 * close down script
 */
fclose($fin) or exit("Unable to close input file {$fn_in}\n");
fclose($fout1) or exit("Unable to close output file: {$fn_out1}\n");
fclose($fout2) or exit("Unable to close output file: {$fn_out2}\n");
echo "Goodbye from run_opencalais\n";

// Functions **********************************************

/**
 * writes the current XML buffer to the second output file
 * note that it is the global $xline buffer that is written, not the OpenCalais reply
 * @param $response string OpenCalais reply (not used by this function)
 * @return null
 */
function preserve_oc($response) {
    global $fout2, $xline;
    fwrite($fout2, $xline);
    return null;
}

/**
 * calls OpenCalais web service for name entity recognition
 * @param $licenseID OpenCalais API key
 * @param $content urlencoded raw text from XML source
 * @param $paramsXML OpenCalais parameters
 * @return $result OpenCalais reply - either entities or error message;
 *                 empty string when $content is empty or the request failed
 */
function call_oc($licenseID, $content, $paramsXML) {
    $result = ''; // defined up front so an empty $content does not return an undefined variable

    if ('' != $content) { // sanity check!
        $call = "http://api.opencalais.com/enlighten/rest/?licenseID={$licenseID}&content={$content}&paramsXML={$paramsXML}";
        $result = file_get_contents($call, FILE_BINARY); // capture OC result as a file stream, ensure can cope with 'funny' characters

        if (false === $result) { // request failed; normalise so callers always receive a string
            echo "OpenCalais request failed\n";
            return '';
        }

        if (strpos($result, '<Exception>') !== false) { // oops, so let's see the message
            // 's' lets the message span several lines; only echo when the pattern really matched
            if (preg_match('/<Exception>(.*)<\/Exception>/msu', $result, $matches)) {
                echo $matches[1];
            }
        }
    }

    return $result;
}

/**
 * applies relevant OpenCalais XML response entities
 * extracts entities specified in $entities array
 * and applies them to source XML
 * and adds to array for printing later
 * @param $response OpenCalais reply
 * @param $xline source text passed as a reference so this function adds mark up to original XML and not to a copy of it
 */
function apply_oc($response, &$xline) {
    global $entities, $found_names;

    // ensure OpenCalais returned something to process
    if ('' != $response) {
        $dom = new DomDocument;
        if (!$dom->loadXML($response)) {
            return; // reply is not well-formed XML, so there are no entities to apply
        }

        foreach ($entities as $e) {
            // look in OpenCalais output for entries in our entity list
            $names = $dom->getElementsByTagName($e);
            foreach ($names as $n) {
                $tmp = $n->textContent;
                if ('' === $tmp) {
                    continue; // nothing to mark up or record
                }

                // handling of keywords broken over two lines is not implemented
                // NOTE(review): both wrapper strings are empty here, so this replace is a no-op;
                // presumably opening/closing tags belong around $tmp - confirm against the original source
                $xline = str_replace($tmp, ''.$tmp.'', $xline);

                // ensure no duplicates otherwise save in found names array
                if (!in_array($tmp, $found_names[$e], true)) {
                    $found_names[$e][] = $tmp;
                }
            }
        }
    }
}
?>