' . ' ' . ' ' . 'ABLE');

// list of OpenCalais entities we want to process - edit this array if different entities required
$entities = array('City', 'Continent', 'Country', 'ProvinceOrState', 'Region');

// miscellaneous processing variables
$a = array(); // found entities grouped by entity type; process_oc() appends to it through `global`
foreach ($entities as $e) {
    // one sub-array per entity type; its first entry is the type name itself,
    // so each group in the output file starts with its entity type
    $a[$e][] = $e;
}
$content = '';           // urlencoded text buffered for the next OpenCalais call
$length_of_content = 0;  // strlen() of $content, tracked separately to avoid recounting
// largest urlencoded chunk sent in one call
// NOTE(review): presumably chosen to keep the GET URL within a safe length - confirm
$max_length_of_content = 3000;

/**
 * main processing
 */
echo "Reading input...\n";

// chunk input for processing
// fgets() returns false at end of file (or on a read error), so testing its result
// avoids handling one bogus empty "line" after the last real one and cannot spin
// forever the way a bare !feof() loop does on an unreadable handle
while (($line = fgets($fin)) !== false) {
    $sline = strip_tags($line);       // remove XML leaving only text
    $uline = urlencode($sline);       // convert text ready for http call
    $length_of_uline = strlen($uline); // how much text in this line
    $new_length_of_content = $length_of_content + $length_of_uline; // total if this line were appended

    if ($max_length_of_content < $new_length_of_content) {
        // buffer would overflow: send what we have, then start a new buffer with this line.
        // The buffer is still empty when the very first line is already over the limit;
        // there is nothing to send in that case.
        if ('' !== $content) {
            echo "Calling OpenCalais...\n";
            $response = call_oc($licenseID, $content, $paramsXML);
            echo "Processing response...\n";
            process_oc($response);
        }
        $content = $uline;                     // start new content
        $length_of_content = $length_of_uline; // and set length to match
    } else {
        $content .= $uline;                          // else append new content
        $length_of_content = $new_length_of_content; // and keep track of new length
    }
}

// process any left over input
if ('' !== $content) {
    echo "Calling OpenCalais...\n";
    $response = call_oc($licenseID, $content, $paramsXML);
    echo "Processing response...\n";
    process_oc($response);
}

// print out multi-dimensional array prepared in process_oc, one value per line
foreach ($a as $entity_type => $entity_value) {
    foreach ($entity_value as $v) {
        fwrite($fout, $v."\n");
    }
}

/**
 * close down script
 */
fclose($fin) or exit("Unable to close input file {$fn_in}\n");
fclose($fout) or exit("Unable to close output file: {$fn_out}\n");
echo "Goodbye from 
run_opencalais\n";

// Functions **********************************************

/**
 * calls OpenCalais web service for name entity recognition
 *
 * @param string $licenseID OpenCalais API key
 * @param string $content   urlencoded raw text from XML source
 * @param string $paramsXML OpenCalais parameters; inserted into the URL as-is, so it is
 *                          presumably urlencoded by the caller already - TODO confirm
 * @return string OpenCalais reply - either entities or error message; an empty string
 *                when $content is empty or the HTTP request failed
 */
function call_oc($licenseID, $content, $paramsXML)
{
    // always defined, so the empty-content path no longer returns an undefined variable
    $response = '';

    if (!is_string($content) || '' === $content) { // sanity check!
        return $response;
    }

    $call = 'http://api.opencalais.com/enlighten/rest/'
          . '?licenseID=' . $licenseID
          . '&content=' . $content
          . '&paramsXML=' . $paramsXML;

    // The second argument of file_get_contents() is use_include_path, not a flags field;
    // the FILE_BINARY constant passed there before has the value 0 and is deprecated in
    // recent PHP versions, so it is simply left out. The reply is read as raw bytes.
    $reply = file_get_contents($call);
    if (false === $reply) {
        // request failed; PHP has already raised a warning describing why
        return $response;
    }
    $response = $reply;

    if (strpos($response, '<Exception>') !== false) { // oops, so let's see the message
        // "s" lets the message span several lines, "u" treats the reply as UTF-8
        if (1 === preg_match('#<Exception>(.*?)</Exception>#su', $response, $matches)) {
            echo $matches[1];
        }
    }

    return $response;
}

/**
 * processes OpenCalais XML to extract desired entities and format them for printing
 * only processes the entities specified in the $entities array
 *
 * Results are appended to the global $a, in the sub-array keyed by entity type.
 * Empty or unparsable input leaves $a untouched.
 *
 * @param string $response
 */
function process_oc($response)
{
    global $a, $entities;

    // ensure OpenCalais returned something to process
    if (!is_string($response) || '' === $response || !is_array($entities)) {
        return;
    }

    $dom = new DOMDocument();
    if (!$dom->loadXML($response)) {
        // not well-formed XML: nothing can be extracted from it
        return;
    }

    foreach ($entities as $e) {
        // make sure the bucket exists even if the caller did not pre-seed it
        if (!isset($a[$e]) || !is_array($a[$e])) {
            $a[$e] = array();
        }

        // look in OpenCalais output for entries in our entity list
        $names = $dom->getElementsByTagName($e);
        foreach ($names as $n) {
            $tmp = $n->textContent;

            // values containing a line break are marked with a 'TWO ' prefix
            // NOTE(review): presumably flags elements holding more than one value - confirm
            if (strpos($tmp, "\n") !== false) {
                $tmp = 'TWO '.$tmp;
            }

            // ensure no duplicates; the strict comparison stops numeric-looking strings
            // such as "1000" and "1e3" from being treated as the same value
            if (!in_array($tmp, $a[$e], true)) {
                $a[$e][] = $tmp;
            }
        }
    }
}
?>