/**
 * Console entry point: re-reads the PDF metadata (page count and document date)
 * of a single document or of a batch of documents and saves it.
 *
 * Usage: ./yii recalc_documents [Dokument-ID|alle]
 *
 * For each document the file is downloaded if necessary, geo_extract() is run,
 * and seiten_anzahl / datum_dokument are refreshed from the local file.
 *
 * @param array $args command-line arguments; $args[0] is a document ID or the literal "alle"
 */
public function run($args)
 {
     // Guarded so that a second invocation in the same process does not
     // trigger a "constant already defined" warning.
     if (!defined("VERYFAST")) {
         define("VERYFAST", true);
     }
     if (count($args) === 0) {
         die("./yii recalc_documents [Dokument-ID|alle]\n");
     }
     if ((string)$args[0] === "alle") {
         $sql = Yii::app()->db->createCommand();
         // NOTE(review): the lower bound 579866 is hard-coded; confirm it is still the intended starting ID.
         $sql->select("id")->from("dokumente")->where("id >= 579866")->order("id");
         // queryColumn() takes bind parameters; this query has no placeholders, so none are passed.
         $data = $sql->queryColumn();
     } else {
         $data = [IntVal($args[0])];
     }
     $anz = count($data);
     foreach ($data as $nr => $dok_id) {
         echo "{$nr} / {$anz} => {$dok_id}\n";
         /** @var Dokument $dokument */
         $dokument = Dokument::model()->findByPk($dok_id);
         if (!$dokument) {
             continue;
         }
         $dokument->download_if_necessary();
         $dokument->geo_extract();
         $absolute_filename = $dokument->getLocalPath();
         // Same guard as download_and_parse(): without a local file there is no metadata to read.
         if (!file_exists($absolute_filename)) {
             echo "Not Found: " . $dokument->id . "\n";
             continue;
         }
         $metadata = RISPDF2Text::document_pdf_metadata($absolute_filename);
         $dokument->seiten_anzahl = $metadata["seiten"];
         $dokument->datum_dokument = $metadata["datum"];
         // Same normalisation as download_and_parse(): an empty date is stored as null.
         if ($dokument->datum_dokument == "") {
             $dokument->datum_dokument = null;
         }
         $dokument->save();
         echo $dokument->id . " => " . $dokument->seiten_anzahl . " / " . $dokument->datum_dokument . "\n";
     }
 }
Ejemplo n.º 2
0
 /**
  * Downloads the document file if necessary and fills this record's text and
  * metadata attributes from the local copy.
  *
  * Prints "Not Found: <id>" and returns early when the local file does not exist.
  * Otherwise sets seiten_anzahl, datum_dokument (an empty value becomes null),
  * text_pdf (only when the URL ends in ".pdf", otherwise ""), text_ocr_raw,
  * text_ocr_corrected and ocr_von, and finally copies the file to
  * OMNIPAGE_PDF_DIR as "<id>.<extension>". This method itself does not call save().
  */
 public function download_and_parse()
 {
     $this->download_if_necessary();

     $local_file = $this->getLocalPath();
     if (!file_exists($local_file)) {
         echo "Not Found: " . $this->id . "\n";
         return;
     }

     // File extension: lower-cased text after the last "." of the URL.
     $url_parts = explode(".", $this->url);
     $extension = mb_strtolower(end($url_parts));

     $pdf_info = RISPDF2Text::document_pdf_metadata($local_file);
     $this->seiten_anzahl = $pdf_info["seiten"];
     $this->datum_dokument = $pdf_info["datum"];
     if ($this->datum_dokument == "") {
         $this->datum_dokument = null;
     }

     // Embedded text is only extracted for PDFs; OCR below runs for every file.
     $this->text_pdf = ($extension == "pdf") ? RISPDF2Text::document_text_pdf($local_file) : "";

     $this->text_ocr_raw = RISPDF2Text::document_text_ocr($local_file, $this->seiten_anzahl);
     $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($this->text_ocr_raw);
     $this->ocr_von = Dokument::$OCR_VON_TESSERACT;

     $omnipage_target = OMNIPAGE_PDF_DIR . $this->id . "." . $extension;
     copy($local_file, $omnipage_target);
 }
 /**
  * Imports the archive files of one year from the directory PATH_PDF_RU . $jahr . "/".
  *
  * For every regular file whose name starts with a positive number, the text is
  * extracted and searched for a German date "<day>. <month name> <year>". Files
  * without such a date are skipped. The Rathausumschau record for (year, number)
  * is looked up, created if missing, and its id is passed to parse().
  *
  * @param int|string $jahr year; used as directory name, in the date pattern and as the "jahr" attribute
  */
 public function parseArchive1($jahr)
 {
     $dir = PATH_PDF_RU . $jahr . "/";
     $dh = opendir($dir);
     if (!$dh) {
         return;
     }

     // Quote the year so it is always matched literally inside the pattern.
     $pattern = "/(?<tag>[0-9]+)\\. (?<monat>Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember) "
         . preg_quote((string)$jahr, "/") . "/siu";

     while (($file = readdir($dh)) !== false) {
         // The issue number is the leading integer of the file name. Comparing the raw
         // string with 0 is PHP-version dependent (PHP 8 compares non-numeric strings
         // as strings), so the integer value is tested explicitly.
         $nr = IntVal($file);
         if (!is_file($dir . $file) || $nr <= 0) {
             continue;
         }

         $content = RISPDF2Text::document_text_pdf($dir . $file);
         preg_match($pattern, $content, $datum);
         if (!isset($datum["monat"])) {
             continue;
         }

         $ru = Rathausumschau::model()->findByAttributes(["jahr" => $jahr, "nr" => $nr]);
         if (!$ru) {
             $ru = new Rathausumschau();
             $ru->nr = $nr;
             $ru->url = $file;
             $ru->jahr = $jahr;
             // NOTE(review): the pattern is case-insensitive, but this lookup uses the month
             // exactly as matched; confirm MONAT_MAPPING covers every spelling that can occur.
             $ru->datum = $jahr . "-" . static::$MONAT_MAPPING[$datum["monat"]] . "-" . $datum["tag"];
             if (!$ru->save()) {
                 // Without a stored record there is no id to hand to parse().
                 echo "Could not save Rathausumschau " . $jahr . " / " . $nr . "\n";
                 continue;
             }
         }
         $this->parse($ru->id);
     }
     closedir($dh);
 }