Ejemplo n.º 1
0
 /**
  * Fetches the document (if needed) and fills in its metadata and text fields.
  *
  * Steps, in order:
  *  - calls download_if_necessary() and resolves the local file via getLocalPath();
  *  - if that file does not exist, prints "Not Found: <id>" and returns without
  *    touching any field;
  *  - stores the "seiten" and "datum" entries returned by
  *    RISPDF2Text::document_pdf_metadata() in seiten_anzahl and datum_dokument
  *    (an empty "datum" is stored as null);
  *  - when the URL ends in ".pdf" (case-insensitive) stores
  *    RISPDF2Text::document_text_pdf() in text_pdf, otherwise an empty string;
  *  - stores the raw OCR text and its cleaned variant, and sets ocr_von to
  *    Dokument::$OCR_VON_TESSERACT;
  *  - copies the local file to OMNIPAGE_PDF_DIR as "<id>.<extension>" and prints
  *    "Copy failed: <id>" if that copy does not succeed.
  *
  * Only properties of this object are assigned here; the method itself does not
  * save the object.
  *
  * @return void
  */
 public function download_and_parse()
 {
     $this->download_if_necessary();
     $absolute_filename = $this->getLocalPath();
     if (!file_exists($absolute_filename)) {
         echo "Not Found: " . $this->id . "\n";
         return;
     }

     // pathinfo() inspects only the last path segment, so a URL whose final
     // segment has no dot yields "" instead of a fragment such as "tld/doc"
     // (which would put a directory separator into the copy target below).
     $endung = mb_strtolower(pathinfo((string)$this->url, PATHINFO_EXTENSION));

     $metadata = RISPDF2Text::document_pdf_metadata($absolute_filename);
     $this->seiten_anzahl = $metadata["seiten"];
     $this->datum_dokument = $metadata["datum"];
     // Loose comparison kept on purpose: the exact type the metadata helper
     // returns for a missing date is not visible from here.
     if ($this->datum_dokument == "") {
         $this->datum_dokument = null;
     }

     // Embedded text is only extracted for PDFs; OCR below runs for every file.
     if ($endung === "pdf") {
         $this->text_pdf = RISPDF2Text::document_text_pdf($absolute_filename);
     } else {
         $this->text_pdf = "";
     }

     $this->text_ocr_raw = RISPDF2Text::document_text_ocr($absolute_filename, $this->seiten_anzahl);
     $this->text_ocr_corrected = RISPDF2Text::ris_ocr_clean($this->text_ocr_raw);
     $this->ocr_von = Dokument::$OCR_VON_TESSERACT;

     $omnipage_target = OMNIPAGE_PDF_DIR . $this->id . "." . $endung;
     if (!copy($absolute_filename, $omnipage_target)) {
         // Report the failure the same way the missing-file case is reported.
         echo "Copy failed: " . $this->id . "\n";
     }
 }