/**
 * Parses OCR text into a Darwin Core field array and serializes it in the
 * format selected by $this->returnFormat.
 *
 * Supported formats: 'json', 'xml' (SimpleDarwinRecord wrapped in a
 * DwcRecordSet), 'html' (one "<b>field</b>: value<br/>" entry per field).
 * Any other value yields the field values joined with commas.
 *
 * On failure $this->errorStr is set, the message is logged through
 * $this->logMsg(), and false is returned.
 *
 * @param string $ocrInput Raw OCR text; it is passed through
 *                         $this->cleanOcrInput() before parsing.
 * @return string|false Serialized record, or false when the cleaned input is
 *                      empty, the parser returns nothing, or nothing is left
 *                      after $this->cleanDwcArr().
 */
public function parse($ocrInput) {
    $ocrInput = $this->cleanOcrInput($ocrInput);
    if (!$ocrInput) {
        $this->errorStr = 'FATAL ERROR: Input string is null';
        $this->logMsg($this->errorStr);
        return false;
    }
    $this->logMsg("Parsing OCR:\n" . $ocrInput);

    //Parse
    $salixManager = new SpecProcNlpSalix();
    $dwcArr = $salixManager->parse($ocrInput);
    if (!$dwcArr) {
        $this->errorStr = 'NOTICE: Parser failed to return any data';
        $this->logMsg($this->errorStr);
        return false;
    }
    $dwcArr = $this->cleanDwcArr($dwcArr);
    if (!$dwcArr) {
        $this->errorStr = 'NOTICE: Parsed data empty after cleaning';
        $this->logMsg($this->errorStr);
        return false;
    }

    //Format return
    $retStr = '';
    if ($this->returnFormat == 'json') {
        $retStr = json_encode($dwcArr);
    } elseif ($this->returnFormat == 'xml') {
        $root = '<?xml version="1.0" encoding="' . strtoupper($this->charset) . '"?><DwcRecordSet xmlns="http://rs.tdwg.org/dwc/xsd/simpledarwincore/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://rs.tdwg.org/dwc/xsd/simpledarwincore/ http://rs.tdwg.org/dwc/xsd/tdwg_dwc_simple.xsd"></DwcRecordSet>';
        $xml = new SimpleXMLElement($root);
        $xmlRec = $xml->addChild('SimpleDarwinRecord', '');
        foreach ($dwcArr as $k => $v) {
            // addChild() interprets its value as XML content: "<" and ">" are
            // escaped for us, but a bare "&" is read as the start of an entity
            // reference, which raises a warning and drops the text. Escape it
            // up front so OCR text such as "Smith & Jones" survives intact.
            $xmlRec->addChild($k, str_replace('&', '&amp;', (string) $v));
        }
        $retStr = $xml->asXML();
    } elseif ($this->returnFormat == 'html') {
        // Field names and values originate from OCR text, so they must not be
        // allowed to inject markup into the generated HTML.
        $escFlags = ENT_QUOTES | ENT_SUBSTITUTE;
        foreach ($dwcArr as $k => $v) {
            $retStr .= '<b>' . htmlspecialchars((string) $k, $escFlags, $this->charset) . '</b>: '
                . htmlspecialchars((string) $v, $escFlags, $this->charset) . '<br/>';
        }
    } else {
        // Fallback: plain comma-joined values (no quoting of embedded commas).
        $retStr = implode(',', $dwcArr);
    }
    $this->logMsg($retStr);
    return $retStr;
}
// Request handler: takes raw OCR text from the "rawocr" request parameter,
// normalizes typographic punctuation, runs it through the SALIX parser and
// echoes the cleaned Darwin Core array as JSON.
// NOTE(review): $serverRoot is not defined in the visible code; presumably it
// is set by a configuration include earlier in the file - confirm.
include_once $serverRoot . '/classes/SpecProcNlpUtilities.php';
include_once $serverRoot . '/classes/SpecProcNlpSalix.php';
// NOTE(review): the body is JSON but the declared type is text/html. The header
// is left untouched so existing clients keep working; see the encoding flags
// at the bottom of the script.
header("Content-Type: text/html; charset=UTF-8");

// A missing parameter (or an array such as rawocr[]=x) is treated as "no
// input" instead of raising an undefined-index notice or reaching the parser.
$rawOcr = (isset($_REQUEST['rawocr']) && is_string($_REQUEST['rawocr'])) ? $_REQUEST['rawocr'] : '';
// Set to a truthy value to dump the normalized input and the parsed fields to
// temp/logs/ocrdebug.txt.
$debug = 0;

$nlpManager = new SpecProcNlpSalix();
$dwcArr = array();
if ($rawOcr) {
    //Get rid of curly (smart) quotes
    $search = array("’", "‘", "`", "”", "“");
    $replace = array("'", "'", "'", '"', '"');
    $rawOcr = str_replace($search, $replace, $rawOcr);

    //Get rid of UTF-8 curly smart quotes and dashes
    // (these literals appear to be the mis-decoded, multi-character form of
    // that punctuation; they must stay byte-for-byte as they are)
    $badwordchars = array("‘", "’", "“", "â€�", "—", "…");
    $fixedwordchars = array("'", "'", '"', '"', '-', '...');
    $rawOcr = str_replace($badwordchars, $fixedwordchars, $rawOcr);

    $dwcArr = $nlpManager->parse($rawOcr);

    if ($debug) {
        $fh = fopen($serverRoot . '/temp/logs/ocrdebug.txt', 'w');
        // fopen() returns false when the log cannot be opened; skip the dump
        // rather than calling fwrite()/fclose() on an invalid handle.
        if ($fh !== false) {
            fwrite($fh, 'Raw OCR:');
            fwrite($fh, $rawOcr);
            fwrite($fh, "\n\n\n------------------------------------------------------------------\n\n\n");
            fwrite($fh, 'Parsed data:');
            // Only iterate when the parser actually produced an array.
            if (is_array($dwcArr)) {
                foreach ($dwcArr as $k => $v) {
                    fwrite($fh, $k . ': ' . $v . "\n");
                }
            }
            fclose($fh);
        }
    }

    $dwcArr = SpecProcNlpUtilities::cleanDwcArr($dwcArr);
}
// JSON_HEX_TAG / JSON_HEX_AMP emit <, > and & as \u00XX escapes. The decoded
// value is identical for any JSON parser, but the payload can no longer be
// interpreted as markup when the response is rendered as text/html.
echo json_encode($dwcArr, JSON_HEX_TAG | JSON_HEX_AMP);