Ejemplo n.º 1
0
 /**
  * Fetches the document if needed and fills this object's text/metadata fields.
  *
  * Calls download_if_necessary(), then works on the file returned by
  * getLocalPath(). If that file does not exist, a "Not Found" line containing
  * the id is echoed and the method returns without touching any field.
  *
  * Otherwise it sets seiten_anzahl, datum_dokument (null when the metadata
  * date compares equal to ""), text_pdf (only extracted when the URL ends in
  * ".pdf", else ""), text_ocr_raw, text_ocr_corrected and ocr_von, and finally
  * copies the local file to OMNIPAGE_PDF_DIR as "<id>.<extension>".
  *
  * @return void
  */
 public function download_and_parse()
 {
     $this->download_if_necessary();

     $localFile = $this->getLocalPath();
     if (!file_exists($localFile)) {
         echo "Not Found: " . $this->id . "\n";
         return;
     }

     // The extension is whatever follows the last dot of the URL, lower-cased.
     $urlParts = explode(".", $this->url);
     $extension = mb_strtolower(end($urlParts));

     $pdfInfo = RISPDF2Text::document_pdf_metadata($localFile);
     $this->seiten_anzahl = $pdfInfo["seiten"];
     $documentDate = $pdfInfo["datum"];
     // An empty date is stored as null rather than as an empty string.
     $this->datum_dokument = ($documentDate == "") ? null : $documentDate;

     // Direct text extraction is only attempted for PDFs; OCR below runs for every file.
     $this->text_pdf = ($extension === "pdf")
         ? RISPDF2Text::document_text_pdf($localFile)
         : "";

     $rawOcr = RISPDF2Text::document_text_ocr($localFile, $this->seiten_anzahl);
     $this->text_ocr_raw = $rawOcr;
     $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($rawOcr);
     $this->ocr_von = Dokument::$OCR_VON_TESSERACT;

     $target = OMNIPAGE_PDF_DIR . $this->id . "." . $extension;
     copy($localFile, $target);
 }
 /**
  * Imports all numbered files found in the archive directory of one year.
  *
  * Every regular file in PATH_PDF_RU . $jahr . "/" whose name starts with a
  * positive number is read via RISPDF2Text::document_text_pdf(). When the text
  * contains a date of the form "<day>. <German month name> <jahr>", the
  * Rathausumschau record with that jahr and nr is looked up and created if it
  * does not exist yet; $this->parse() is then called with the record's id.
  * Files without such a date are skipped. Nothing happens if the directory
  * cannot be opened.
  *
  * @param mixed $jahr Year (type is not declared); it is concatenated into the
  *                    directory path, the date pattern and the stored date.
  * @return void
  */
 public function parseArchive1($jahr)
 {
     $dir = PATH_PDF_RU . $jahr . "/";
     $dh = opendir($dir);
     if ($dh === false) {
         return;
     }

     // Loop-invariant, so it is built once. preg_quote() makes sure the year is
     // matched literally and can never be interpreted as regex syntax.
     $pattern = "/(?<tag>[0-9]+)\\. (?<monat>Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) "
         . preg_quote((string) $jahr, "/") . "/siu";

     while (($file = readdir($dh)) !== false) {
         // The leading number of the file name is the issue number. Filtering on the
         // parsed integer (instead of comparing the raw name with 0) gives the same
         // result on every PHP version: PHP 8 compares non-numeric strings with an
         // int as strings, which would let names like "readme.txt" through as nr 0.
         $nr = intval($file);
         if ($nr <= 0 || !is_file($dir . $file)) {
             continue;
         }

         $content = RISPDF2Text::document_text_pdf($dir . $file);
         if (preg_match($pattern, $content, $datum) !== 1) {
             // No recognisable date for this year in the text: skip the file.
             continue;
         }

         $ru = Rathausumschau::model()->findByAttributes(["jahr" => $jahr, "nr" => $nr]);
         if (!$ru) {
             $ru = new Rathausumschau();
             $ru->nr = $nr;
             $ru->url = $file;
             $ru->jahr = $jahr;
             // NOTE(review): the pattern is case-insensitive, so $datum["monat"] may differ
             // in case from the MONAT_MAPPING keys — confirm the mapping covers that.
             $ru->datum = $jahr . "-" . static::$MONAT_MAPPING[$datum["monat"]] . "-" . $datum["tag"];
             $ru->save();
         }
         $this->parse($ru->id);
     }
     closedir($dh);
 }