/**
 * Parse raw OCR text and return the extracted record in the configured format.
 *
 * The input is first passed through cleanOcrInput(), then handed to
 * SpecProcNlpSalix::parse(); the result is passed through cleanDwcArr() and
 * serialized according to $this->returnFormat:
 *   - 'json': json_encode() of the parsed array
 *   - 'xml' : a DwcRecordSet document with one SimpleDarwinRecord element
 *   - 'html': "<b>key</b>: value<br/>" lines, with keys and values HTML-escaped
 *   - any other value: the values joined with commas
 *
 * On every failure path $this->errorStr is set, the message is logged via
 * logMsg(), and false is returned.
 *
 * @param string $ocrInput Raw OCR text to parse.
 * @return string|false Serialized record, or false when the input is empty,
 *                      the parser yields nothing, cleaning leaves nothing,
 *                      or JSON encoding fails.
 */
public function parse($ocrInput)
 {
     $ocrInput = $this->cleanOcrInput($ocrInput);
     if (!$ocrInput) {
         $this->errorStr = 'FATAL ERROR: Input string is null';
         $this->logMsg($this->errorStr);
         return false;
     }
     $this->logMsg("Parsing OCR:\n" . $ocrInput);
     //Parse
     $salixManager = new SpecProcNlpSalix();
     $dwcArr = $salixManager->parse($ocrInput);
     if (!$dwcArr) {
         $this->errorStr = 'NOTICE: Parser failed to return any data';
         $this->logMsg($this->errorStr);
         return false;
     }
     $dwcArr = $this->cleanDwcArr($dwcArr);
     if (!$dwcArr) {
         $this->errorStr = 'NOTICE: Parsed data empty after cleaning';
         $this->logMsg($this->errorStr);
         return false;
     }
     //Format return
     $retStr = '';
     if ($this->returnFormat == 'json') {
         $retStr = json_encode($dwcArr);
         if ($retStr === false) {
             // json_encode() returns false on failure (e.g. malformed UTF-8);
             // report it like the other failure paths instead of silently
             // logging and returning a bare false.
             $this->errorStr = 'ERROR: Unable to encode parsed data as JSON';
             $this->logMsg($this->errorStr);
             return false;
         }
     } elseif ($this->returnFormat == 'xml') {
         $root = '<?xml version="1.0" encoding="' . strtoupper($this->charset) . '"?><DwcRecordSet xmlns="http://rs.tdwg.org/dwc/xsd/simpledarwincore/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://rs.tdwg.org/dwc/xsd/simpledarwincore/ http://rs.tdwg.org/dwc/xsd/tdwg_dwc_simple.xsd"></DwcRecordSet>';
         $xml = new SimpleXMLElement($root);
         $xmlRec = $xml->addChild('SimpleDarwinRecord', '');
         foreach ($dwcArr as $k => $v) {
             // addChild() treats the value as XML content and does not escape
             // ampersands: a bare "&" raises an "unterminated entity reference"
             // warning and the value is lost. Escape it so the text survives.
             $xmlRec->addChild($k, $v === null ? null : str_replace('&', '&amp;', $v));
         }
         $retStr = $xml->asXML();
     } elseif ($this->returnFormat == 'html') {
         foreach ($dwcArr as $k => $v) {
             // Keys and values originate from OCR text, so escape them before
             // embedding in markup to prevent HTML/script injection.
             $retStr .= '<b>' . htmlspecialchars((string)$k, ENT_QUOTES, $this->charset) . '</b>: '
                 . htmlspecialchars((string)$v, ENT_QUOTES, $this->charset) . '<br/>';
         }
     } else {
         $retStr = implode(',', $dwcArr);
     }
     $this->logMsg($retStr);
     return $retStr;
 }
Beispiel #2
0
<?php

//error_reporting(E_ALL);
error_reporting(0);
include_once '../../../config/symbini.php';
include_once $serverRoot . '/classes/SpecProcNlpUtilities.php';
include_once $serverRoot . '/classes/SpecProcNlpSalix.php';
header("Content-Type: text/html; charset=UTF-8");
$rawOcr = $_REQUEST['rawocr'];
$debug = 0;
$nlpManager = new SpecProcNlpSalix();
$dwcArr = array();
if ($rawOcr) {
    //Get rid of curly (smart) quotes
    $search = array("’", "‘", "`", "”", "“");
    $replace = array("'", "'", "'", '"', '"');
    $rawOcr = str_replace($search, $replace, $rawOcr);
    //Get rid of UTF-8 curly smart quotes and dashes
    $badwordchars = array("‘", "’", "“", "�", "—", "…");
    $fixedwordchars = array("'", "'", '"', '"', '-', '...');
    $rawOcr = str_replace($badwordchars, $fixedwordchars, $rawOcr);
    $dwcArr = $nlpManager->parse($rawOcr);
    if ($debug) {
        $fh = fopen($serverRoot . '/temp/logs/ocrdebug.txt', 'w');
        fwrite($fh, 'Raw OCR:');
        fwrite($fh, $rawOcr);
        fwrite($fh, "\n\n\n------------------------------------------------------------------\n\n\n");
        fwrite($fh, 'Parsed data:');
        foreach ($dwcArr as $k => $v) {
            fwrite($fh, $k . ': ' . $v . "\n");
        }