/**
 * Makes sure the document is available locally, then extracts its
 * metadata and text into this object's properties.
 *
 * Assigns seiten_anzahl, datum_dokument, text_pdf, text_ocr_raw,
 * text_ocr_corrected and ocr_von, and finally copies the local file to
 * OMNIPAGE_PDF_DIR as "<id>.<extension>". When the local file does not
 * exist, a "Not Found" line is printed and the method returns without
 * touching any property.
 *
 * @return void
 */
public function download_and_parse()
{
    $this->download_if_necessary();

    $localFile = $this->getLocalPath();
    if (!file_exists($localFile)) {
        echo "Not Found: " . $this->id . "\n";
        return;
    }

    // Whatever follows the last dot of the URL, lower-cased, is used as
    // the file extension.
    $urlParts = explode(".", $this->url);
    $extension = mb_strtolower(end($urlParts));

    $meta = RISPDF2Text::document_pdf_metadata($localFile);
    $pageCount = $meta["seiten"];
    $docDate = $meta["datum"];

    $this->seiten_anzahl = $pageCount;
    // An empty date is stored as null rather than as an empty string.
    $this->datum_dokument = ($docDate == "") ? null : $docDate;

    // Embedded text is only extracted for PDFs; other types get "".
    $this->text_pdf = ($extension == "pdf")
        ? RISPDF2Text::document_text_pdf($localFile)
        : "";

    // OCR runs for every file type, followed by the clean-up pass.
    $rawOcr = RISPDF2Text::document_text_ocr($localFile, $pageCount);
    $this->text_ocr_raw = $rawOcr;
    $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($rawOcr);
    $this->ocr_von = Dokument::$OCR_VON_TESSERACT;

    copy($localFile, OMNIPAGE_PDF_DIR . $this->id . "." . $extension);
}
/**
 * Imports the PDF files found in the archive directory of one year.
 *
 * Walks PATH_PDF_RU . $jahr . "/" and considers every regular file whose
 * name starts with a positive integer (that integer becomes the issue
 * number). The PDF text is searched for a date of the form
 * "<day>. <German month name> <jahr>"; files without such a date are
 * skipped. For the remaining files a Rathausumschau record is looked up
 * by (jahr, nr) and created if missing, then parse() is called with the
 * record's id. If the directory cannot be opened nothing happens.
 *
 * @param int|string $jahr Year; used as directory name, in the date
 *                         pattern and as the record's "jahr" attribute.
 * @return void
 */
public function parseArchive1($jahr)
{
    $dir = PATH_PDF_RU . $jahr . "/";

    $dh = opendir($dir);
    if (!$dh) {
        return;
    }

    // Loop-invariant, so built once. The year is quoted so that it can
    // never be interpreted as regex syntax.
    $datePattern = "/(?<tag>[0-9]+)\\. (?<monat>Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) "
        . preg_quote((string) $jahr, "/") . "/siu";

    while (($file = readdir($dh)) !== false) {
        // Compare the parsed number, not the raw file name: since PHP 8
        // a non-numeric string compared with an int is compared as a
        // string, so e.g. "readme.txt" > 0 would be true and let files
        // without a leading issue number through.
        $nr = intval($file);
        if (!is_file($dir . $file) || $nr <= 0) {
            continue;
        }

        $content = RISPDF2Text::document_text_pdf($dir . $file);
        preg_match($datePattern, $content, $datum);
        if (!isset($datum["monat"])) {
            continue;
        }

        $ru = Rathausumschau::model()->findByAttributes(["jahr" => $jahr, "nr" => $nr]);
        if (!$ru) {
            $ru = new Rathausumschau();
            $ru->nr = $nr;
            $ru->url = $file;
            $ru->jahr = $jahr;
            // NOTE(review): the pattern matches case-insensitively, but the
            // matched text is used verbatim as mapping key; presumably the
            // keys are the capitalised month names - confirm that e.g.
            // "JANUAR" cannot occur in the PDFs or normalise the key.
            $ru->datum = $jahr . "-" . static::$MONAT_MAPPING[$datum["monat"]] . "-" . $datum["tag"];
            $ru->save();
        }

        // Runs for existing and freshly created records alike.
        $this->parse($ru->id);
    }

    closedir($dh);
}