/**
 * Console entry point: recalculates page count and document date for one
 * document, or for every document selected by the "alle" query.
 *
 * Usage: ./yii recalc_documents [Dokument-ID|alle]
 *
 * For each document the file is downloaded if necessary, geo_extract() is run,
 * the PDF metadata is re-read and the record is saved.
 *
 * @param array $args Command-line arguments; $args[0] is either a document ID
 *                    or the literal string "alle".
 */
public function run($args)
{
    // Guard the definition so a second invocation in the same process does not
    // trigger a "constant already defined" warning.
    // NOTE(review): the consumer of VERYFAST is not visible in this chunk.
    if (!defined("VERYFAST")) {
        define("VERYFAST", true);
    }

    if (count($args) == 0) {
        die("./yii recalc_documents [Dokument-ID|alle]\n");
    }

    if ($args[0] === "alle") {
        $sql = Yii::app()->db->createCommand();
        // NOTE(review): hard-coded lower bound on the ID; presumably left over
        // from resuming an interrupted batch run -- confirm before relying on "alle".
        $sql->select("id")->from("dokumente")->where("id >= 579866")->order("id");
        // queryColumn() expects bind parameters, not a column name. The query has
        // no placeholders, so nothing must be bound here.
        $data = $sql->queryColumn();
    } else {
        $data = [IntVal($args[0])];
    }

    $anz = count($data);
    foreach ($data as $nr => $dok_id) {
        echo "{$nr} / {$anz} => {$dok_id}\n";

        /** @var Dokument $dokument */
        $dokument = Dokument::model()->findByPk($dok_id);
        if (!$dokument) {
            // Unknown ID: nothing to recalculate.
            continue;
        }

        $dokument->download_if_necessary();
        $dokument->geo_extract();

        $absolute_filename = $dokument->getLocalPath();
        $metadata          = RISPDF2Text::document_pdf_metadata($absolute_filename);

        $dokument->seiten_anzahl  = $metadata["seiten"];
        $dokument->datum_dokument = $metadata["datum"];
        // Store a missing date as NULL rather than an empty string, matching
        // what download_and_parse() does with the same metadata field.
        if ($dokument->datum_dokument == "") {
            $dokument->datum_dokument = null;
        }
        $dokument->save();

        echo $dokument->id . " => " . $dokument->seiten_anzahl . " / " . $dokument->datum_dokument . "\n";
    }
}
/**
 * Fetches the document file (if necessary) and fills this record's derived
 * fields from it: page count, document date, embedded PDF text, raw and
 * cleaned OCR text, and the OCR source marker. Finally copies the file into
 * OMNIPAGE_PDF_DIR under "<id>.<extension>".
 *
 * Prints "Not Found: <id>" and returns early when the local file is missing.
 * This method only assigns attributes; it does not call save() itself.
 */
public function download_and_parse()
{
    $this->download_if_necessary();

    $localFile = $this->getLocalPath();
    if (!file_exists($localFile)) {
        echo "Not Found: {$this->id}\n";
        return;
    }

    // The extension is whatever follows the last "." of the URL, lower-cased.
    $urlParts  = explode(".", $this->url);
    $extension = mb_strtolower(end($urlParts));

    $meta                 = RISPDF2Text::document_pdf_metadata($localFile);
    $documentDate         = $meta["datum"];
    $this->seiten_anzahl  = $meta["seiten"];
    // An empty date is stored as NULL instead of an empty string.
    $this->datum_dokument = ($documentDate == "") ? null : $documentDate;

    // Embedded text is only extracted for PDFs; other file types get an empty string.
    $this->text_pdf = ($extension === "pdf")
        ? RISPDF2Text::document_text_pdf($localFile)
        : "";

    // OCR runs for every file type; the cleaned text is derived from the raw OCR output.
    $rawOcr                   = RISPDF2Text::document_text_ocr($localFile, $this->seiten_anzahl);
    $this->text_ocr_raw       = $rawOcr;
    $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($this->text_ocr_raw);
    $this->ocr_von            = Dokument::$OCR_VON_TESSERACT;

    $omnipageTarget = OMNIPAGE_PDF_DIR . $this->id . "." . $extension;
    copy($localFile, $omnipageTarget);
}