/**
 * Console entry point: re-runs geo extraction and PDF metadata extraction
 * (page count and document date) for one document or for a range of documents,
 * and saves each processed record.
 *
 * Usage: ./yii recalc_documents [Dokument-ID|alle]
 *
 * @param array $args Command line arguments. $args[0] is either a document id
 *                    or the literal "alle", which selects every row of the
 *                    "dokumente" table with id >= 579866.
 */
public function run($args) {
    // Guarded so that running the command twice in one process does not
    // trigger an "already defined" warning. The constant is read elsewhere;
    // its effect is not visible from this method.
    if (!defined("VERYFAST")) {
        define("VERYFAST", true);
    }

    if (count($args) == 0) {
        die("./yii recalc_documents [Dokument-ID|alle]\n");
    }

    if ($args[0] == "alle") {
        $sql = Yii::app()->db->createCommand();
        $sql->select("id")->from("dokumente")->where("id >= 579866")->order("id");
        // NOTE(review): the argument of queryColumn() is a list of bind
        // parameters, yet the query has no placeholders - confirm that
        // passing ["id"] here is intended.
        $data = $sql->queryColumn(["id"]);
    } else {
        $data = [IntVal($args[0])];
    }

    $anz = count($data);
    foreach ($data as $nr => $dok_id) {
        // Progress line; $nr is the zero-based array index.
        echo "{$nr} / {$anz} => {$dok_id}\n";

        /** @var Dokument $dokument */
        $dokument = Dokument::model()->findByPk($dok_id);
        if (!$dokument) {
            continue;
        }

        $dokument->download_if_necessary();
        $dokument->geo_extract();

        $absolute_filename = $dokument->getLocalPath();
        // Same guard as in download_and_parse(): without a local file there is
        // nothing to extract metadata from, so report it and move on.
        if (!file_exists($absolute_filename)) {
            echo "Not Found: " . $dokument->id . "\n";
            continue;
        }

        $metadata = RISPDF2Text::document_pdf_metadata($absolute_filename);
        $dokument->seiten_anzahl = $metadata["seiten"];
        $dokument->datum_dokument = $metadata["datum"];
        // Store a missing date as null rather than as an empty string,
        // consistent with download_and_parse().
        if ($dokument->datum_dokument == "") {
            $dokument->datum_dokument = null;
        }
        $dokument->save();

        echo $dokument->id . " => " . $dokument->seiten_anzahl . " / " . $dokument->datum_dokument . "\n";
    }
}
/**
 * Makes sure the document file is available locally and fills this record's
 * metadata and text fields from it.
 *
 * Steps, in order:
 *  - download the file if necessary; print "Not Found: <id>" and stop when the
 *    local file still does not exist,
 *  - set seiten_anzahl and datum_dokument from the PDF metadata (an empty date
 *    becomes null),
 *  - set text_pdf (only when the URL ends in ".pdf", otherwise ""),
 *  - set text_ocr_raw, text_ocr_corrected and ocr_von,
 *  - copy the local file to OMNIPAGE_PDF_DIR as "<id>.<extension>".
 *
 * This method does not call save() itself.
 */
public function download_and_parse() {
    $this->download_if_necessary();

    $local_file = $this->getLocalPath();
    if (!file_exists($local_file)) {
        echo "Not Found: " . $this->id . "\n";
        return;
    }

    // The extension is whatever follows the last "." of the URL, lower-cased.
    $url_parts = explode(".", $this->url);
    $extension = mb_strtolower(end($url_parts));

    $pdf_info = RISPDF2Text::document_pdf_metadata($local_file);
    $this->seiten_anzahl = $pdf_info["seiten"];
    $this->datum_dokument = $pdf_info["datum"];
    if ($this->datum_dokument == "") {
        $this->datum_dokument = null;
    }

    // Embedded text is only extracted for PDFs; everything else gets an empty string.
    $this->text_pdf = ($extension == "pdf" ? RISPDF2Text::document_text_pdf($local_file) : "");

    $this->text_ocr_raw = RISPDF2Text::document_text_ocr($local_file, $this->seiten_anzahl);
    $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($this->text_ocr_raw);
    $this->ocr_von = Dokument::$OCR_VON_TESSERACT;

    $omnipage_target = OMNIPAGE_PDF_DIR . $this->id . "." . $extension;
    copy($local_file, $omnipage_target);
}
/**
 * Scans the directory PATH_PDF_RU/<jahr>/ and processes every regular file
 * whose name starts with a positive number.
 *
 * For each such file the PDF text is searched for a date of the form
 * "<day>. <German month name> <jahr>". Files without such a date are skipped.
 * If no Rathausumschau record exists for (jahr, nr), one is created with the
 * number taken from the file name and the date taken from the text. Finally
 * parse() is called with the record's id.
 *
 * @param int|string $jahr Year; used as directory name, in the date pattern
 *                         and as the "jahr" attribute of the record.
 */
public function parseArchive1($jahr) {
    $dir = PATH_PDF_RU . $jahr . "/";
    $dh = opendir($dir);
    if (!$dh) {
        return;
    }

    // $jahr ends up inside the regular expression, so escape it.
    $jahr_pattern = preg_quote((string)$jahr, "/");

    while (($file = readdir($dh)) !== false) {
        // Compare the numeric prefix, not the raw name: since PHP 8 a
        // non-numeric string such as "readme.txt" is compared to 0 as a
        // string and would pass a plain `$file > 0` check.
        $nr = intval($file);
        if (!is_file($dir . $file) || $nr <= 0) {
            continue;
        }

        $content = RISPDF2Text::document_text_pdf($dir . $file);
        preg_match("/(?<tag>[0-9]+)\\. (?<monat>Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) {$jahr_pattern}/siu", $content, $datum);
        if (!isset($datum["monat"])) {
            continue;
        }

        $ru = Rathausumschau::model()->findByAttributes(["jahr" => $jahr, "nr" => $nr]);
        if (!$ru) {
            // The pattern is case-insensitive, so the matched month may be
            // spelled differently from the mapping keys; skip the file rather
            // than store a malformed date.
            $monat = $datum["monat"];
            if (!isset(static::$MONAT_MAPPING[$monat])) {
                continue;
            }

            $ru = new Rathausumschau();
            $ru->nr = $nr;
            $ru->url = $file;
            $ru->jahr = $jahr;
            $ru->datum = $jahr . "-" . static::$MONAT_MAPPING[$monat] . "-" . $datum["tag"];
            $ru->save();
        }

        $this->parse($ru->id);
    }

    closedir($dh);
}