open($fn_in, 'UTF-8') or exit("Failed to open input xml file: {$fn_in} \n");;
// Header XSL input for this volume, opened read-only.
// (Disabled alternative location: '/host/ABLE/BoB/wip/'.$volume.'_teiHeader.xsl')
$fn_hd = 'C:\ABLE\BoB\wip\\' . $volume . '_teiheader.xsl';
$fhd = fopen($fn_hd, 'r');
if (!$fhd) {
    exit("Failed to open input xsl file: {$fn_hd} \n");
}

// Generated output document for this volume, opened for writing.
// (Disabled alternative location: '/host/ABLE/BoB/wip/'.$volume.'_abbyy_to_tei_by_xmlreader.xml')
$fn_out = 'C:\ABLE\BoB\wip\\' . $volume . '_abbyy_to_tei_by_xmlreader.xml';
$fout = fopen($fn_out, 'w');
if (!$fout) {
    exit("Failed to open output xml file: {$fn_out} \n");
}
// define TEI XML
// Fixed text fragments written around the generated content: $tei_begin is written
// before the header file is scanned, $tei_middle between the header step and the body
// text, and $tei_end after the body text.
// NOTE(review): in this copy each fragment holds nothing but a line break; presumably
// the TEI markup that belongs here has been lost - TODO confirm against the original script.
$tei_begin = "
";
$tei_middle = "\n";
$tei_end = "\n";
// initialise working state used by the conversion loop
// $output       - text collected for the current line, flushed when a line element ends
// $write        - flag; not referenced in the visible part of the script
// $hi_count     - number of markers opened by the current formatting element
// $old_fontsize - most recently seen value of the 'fs' attribute
$output = '';
$write = false;
$hi_count = $old_fontsize = 0;
// main processing
// Step 1: write the opening fragment, then read the header XSL file line by line.
echo "Writing TEI header\n";
fwrite($fout, $tei_begin);
// fgets() returns false at end of file, which ends the loop. Because the loop tests
// truthiness, a final line consisting solely of "0" with no newline would end it too.
// NOTE(review): nothing read into $buffer is written to $fout in the loop as it stands.
while ($buffer = fgets($fhd)) {
// NOTE(review): the condition below is syntactically incomplete (unbalanced parenthesis,
// no operand being tested), so the script will not parse as written. It presumably
// searched $buffer for a marker string - TODO restore the original test.
if ('') != false) {
break;
}
}
// Step 2: write the fragment that separates the header from the body text.
echo "Writing TEI text\n";
fwrite($fout, $tei_middle);
// Main conversion loop: walk the input document one node at a time.
//
// Characters are collected into $output from each charParams element rather than from
// text nodes, because (per the original author's note) readString() on text nodes did
// not return spaces. $output is flushed to $fout whenever a line element ends.
//
// NOTE(review): the marker strings appended/written below are empty in this copy of the
// script; presumably they held TEI markup - confirm against the original.
while ($xml->read()) {
    // Capture the node identity first: read() and moveToAttribute() below reposition
    // the reader, and the dispatch must be based on the node the iteration started on.
    $node_type = $xml->nodeType;
    $node_name = $xml->localName;

    if ($node_type === XMLReader::ELEMENT) {
        if ($node_name === 'charParams') {
            // An empty element (<charParams/>) has no content node. Advancing the reader
            // for it would consume whatever follows (the next charParams start tag or the
            // closing formatting tag), so only step forward when there is content.
            if (!$xml->isEmptyElement) {
                $xml->read();
                $char = $xml->value;
                $output .= htmlspecialchars($char, ENT_QUOTES, 'UTF-8');
            }
        } elseif ($node_name === 'formatting') {
            // Each style attribute that is present opens one marker; $hi_count records
            // how many must be closed when this formatting element ends.
            if ($xml->moveToAttribute('bold')) {
                $output .= '';
                $hi_count++;
            }
            if ($xml->moveToAttribute('italic')) {
                $output .= '';
                $hi_count++;
            }
            if ($xml->moveToAttribute('fs')) {
                $fontsize = $xml->value;
                // A marker is opened only when the size differs from the last one seen.
                // The comparison is strict and $old_fontsize starts as integer 0, so the
                // first 'fs' value (a string) always counts as a change.
                if ($old_fontsize !== $fontsize) {
                    $output .= '';
                    $old_fontsize = $fontsize;
                    $hi_count++;
                }
            }
        } elseif ($node_name === 'page') {
            fwrite($fout, "\n");
        } elseif ($node_name === 'par') {
            fwrite($fout, '');
        }
    } elseif ($node_type === XMLReader::END_ELEMENT) {
        if ($node_name === 'formatting') {
            // Close every marker opened by the matching start tag.
            for ($i = 0; $i < $hi_count; ++$i) {
                $output .= '';
            }
            $hi_count = 0;
        } elseif ($node_name === 'line') {
            // End of a line: flush the collected text and start a fresh buffer.
            fwrite($fout, $output.'');
            $output = '';
        } elseif ($node_name === 'page') {
            fwrite($fout, "\n\n");
        } elseif ($node_name === 'par') {
            fwrite($fout, "\n");
        }
    }
}
// Finish the document with the closing fragment.
fwrite($fout, $tei_end);
// close down script
// XMLReader::close() takes no parameters; supplying one raises ArgumentCountError on
// PHP 8, so it is called without arguments.
$xml->close() or exit("Failed to close input xml file: {$fn_in} \n");
fclose($fhd) or exit("Failed to close input xsl file: {$fn_hd} \n");
fclose($fout) or exit("Failed to close output text file: {$fn_out} \n");
echo "\nGoodbye from ABBYY to TEI by XMLReader\n";
?>