/**
 * Convert the WP_IN stage document to DOCX.
 *
 * Runs the Unoconv converter service with the 'docx7' filter on the job's
 * JOB_CONVERSION_STAGE_WP_IN document. On success the output file is
 * registered with the job as its JOB_CONVERSION_STAGE_DOCX document and the
 * job is advanced to that stage; on failure only the job status is changed.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the job has no JOB_CONVERSION_STAGE_WP_IN document
 */
public function process(Job $job)
{
    $converter = $this->sm->get('DocxConversion\\Model\\Converter\\Unoconv');

    // The conversion input is the document registered for the WP_IN stage.
    $sourceDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_WP_IN);
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Configure the converter and run it.
    $converter->setFilter('docx7');
    $converter->setInputFile($sourceDocument->path);
    $targetPath = $job->getDocumentPath() . '/document.docx';
    $converter->setOutputFile($targetPath);
    $converter->convert();

    if ($converter->getStatus()) {
        // Register the generated file as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $converted = $dao->getInstance();
        $converted->path = $targetPath;
        $converted->mimeType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
        $converted->job = $job;
        $converted->conversionStage = JOB_CONVERSION_STAGE_DOCX;

        $job->documents[] = $converted;
        $job->conversionStage = JOB_CONVERSION_STAGE_DOCX;

        return $job;
    }

    // The converter reported a failure: flag the job, register nothing.
    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Extract named entities from the merged XML document.
 *
 * Feeds the job's JOB_CONVERSION_STAGE_XML_MERGE document to the
 * NERExtractor service. On success the resulting JSON file is registered
 * with the job as its JOB_CONVERSION_STAGE_NER_EXTRACT document and the job
 * is advanced to that stage.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the extractor reports a failure.
 * @throws \Exception if the job has no JOB_CONVERSION_STAGE_XML_MERGE document
 */
public function process(Job $job)
{
    $nerExtractor = $this->sm->get('NERExtraction\\Model\\Converter\\NERExtractor');

    // The extractor reads the document of the XML merge stage.
    $mergedDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_XML_MERGE);
    if (!$mergedDocument) {
        throw new \Exception("Couldn't find the stage document");
    }

    // Run the extraction into a JSON file inside the job's document path.
    $nerExtractor->setInputFile($mergedDocument->path);
    $jsonPath = $job->getDocumentPath() . '/named-entities.json';
    $nerExtractor->setOutputFile($jsonPath);
    $nerExtractor->convert();

    if ($nerExtractor->getStatus()) {
        // Register the JSON output as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $entitiesDocument = $dao->getInstance();
        $entitiesDocument->path = $jsonPath;
        $entitiesDocument->mimeType = 'application/json';
        $entitiesDocument->job = $job;
        $entitiesDocument->conversionStage = JOB_CONVERSION_STAGE_NER_EXTRACT;

        $job->documents[] = $entitiesDocument;
        $job->conversionStage = JOB_CONVERSION_STAGE_NER_EXTRACT;

        return $job;
    }

    // Extraction failed: flag the job and leave its documents untouched.
    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Produce a BibTeX file from the references document.
 *
 * Runs the Bibtex converter service on the job's
 * JOB_CONVERSION_STAGE_REFERENCES document. The job is moved to
 * JOB_CONVERSION_STAGE_BIBTEX as soon as the converter has run, whether or
 * not it succeeded; the output document is only registered on success.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the job has no JOB_CONVERSION_STAGE_REFERENCES document
 */
public function process(Job $job)
{
    $converter = $this->sm->get('BibtexConversion\\Model\\Converter\\Bibtex');

    // The converter reads the document of the references stage.
    $sourceDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_REFERENCES);
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Run the conversion into document.bib inside the job's document path.
    $bibPath = $job->getDocumentPath() . '/document.bib';
    $converter->setInputFile($sourceDocument->path);
    $converter->setOutputFile($bibPath);
    $converter->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_BIBTEX;

    if ($converter->getStatus()) {
        // Register the BibTeX file as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $bibDocument = $dao->getInstance();
        $bibDocument->path = $bibPath;
        $bibDocument->job = $job;
        $bibDocument->conversionStage = JOB_CONVERSION_STAGE_BIBTEX;
        $job->documents[] = $bibDocument;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Bundle the job's output documents into a zip archive.
 *
 * Creates documents.zip inside the job's document path (replacing any
 * existing file of that name), adds every document returned by
 * Job::getOutputDocuments() and registers the archive with the job as its
 * JOB_CONVERSION_STAGE_ZIP document.
 *
 * @param Job $job
 * @return Job $job The same job instance, advanced to JOB_CONVERSION_STAGE_ZIP
 * @throws \Exception if the zip extension is missing, or the archive cannot
 *                    be created or written
 */
public function process(Job $job)
{
    if (!class_exists('\\ZipArchive')) {
        throw new \Exception('Zip extension is missing');
    }

    $documentPath = $job->getDocumentPath();
    $zipFile = $documentPath . '/documents.zip';

    // Start from a clean slate so stale entries from an earlier run are not
    // carried over into the new archive.
    if (file_exists($zipFile)) {
        @unlink($zipFile);
    }

    $zip = new \ZipArchive();
    if ($zip->open($zipFile, \ZipArchive::CREATE) !== true) {
        throw new \Exception('Couldn\'t create zip archive');
    }

    // Entries are stored relative to the job's document path. Only a leading
    // occurrence of that path is stripped; paths that do not start with it
    // are used unchanged as the entry name.
    $prefix = $documentPath . '/';
    $prefixLength = strlen($prefix);
    foreach ($job->getOutputDocuments() as $document) {
        $entryName = $document->path;
        if (strpos($entryName, $prefix) === 0) {
            $entryName = substr($entryName, $prefixLength);
        }
        $zip->addFile($document->path, $entryName);
    }

    // The archive is only written to disk on close(); a failure here means
    // there is no usable zip file to register.
    if (!$zip->close()) {
        throw new \Exception('Couldn\'t write zip archive');
    }

    $documentDAO = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
    $zipDocument = $documentDAO->getInstance();
    $zipDocument->path = $zipFile;
    $zipDocument->job = $job;
    $zipDocument->conversionStage = JOB_CONVERSION_STAGE_ZIP;

    $job->documents[] = $zipDocument;
    $job->conversionStage = JOB_CONVERSION_STAGE_ZIP;

    return $job;
}
/**
 * Parse the references of the job's XML document.
 *
 * Runs the References converter service on the PDF-extract document (PDF
 * input) or the NLMXML document (any other input). The job is moved to
 * JOB_CONVERSION_STAGE_REFERENCES once the converter has run; the output
 * document is registered and the success flag is set only when the
 * converter reports success.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the required stage document is missing
 */
public function process(Job $job)
{
    $parser = $this->sm->get('ReferencesConversion\\Model\\Converter\\References');

    // Which stage holds the input depends on the job's input format.
    $sourceStage = ($job->inputFileFormat == JOB_INPUT_TYPE_PDF)
        ? JOB_CONVERSION_STAGE_PDF_EXTRACT
        : JOB_CONVERSION_STAGE_NLMXML;
    $sourceDocument = $job->getStageDocument($sourceStage);
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Run the parser; it is given both an output directory and output file.
    $targetPath = $job->getDocumentPath() . '/document.bib.xml';
    $parser->setInputFile($sourceDocument->path);
    $parser->setOutputDirectory($job->getDocumentPath());
    $parser->setOutputFile($targetPath);
    $parser->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_REFERENCES;

    if (!$parser->getStatus()) {
        $job->status = JOB_STATUS_FAILED;

        return $job;
    }

    // Register the parsed references as a new document of the job.
    $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
    $referencesDocument = $dao->getInstance();
    $referencesDocument->path = $targetPath;
    $referencesDocument->job = $job;
    $referencesDocument->conversionStage = JOB_CONVERSION_STAGE_REFERENCES;
    $job->documents[] = $referencesDocument;

    // Mark reference parsing as successful; this flag influences which
    // conversion steps are executed afterwards.
    $job->referenceParsingSuccess = true;

    return $job;
}
/**
 * Apply the job's citation style.
 *
 * Hands the NLMXML, BibTeX and HTML stage documents plus the job's citation
 * style file to the Pandoc converter service. No new document is created:
 * on success the HTML document is re-labelled with the job's new
 * conversion stage.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the NLMXML, BibTeX or HTML stage document is missing
 */
public function process(Job $job)
{
    $converter = $this->sm->get('CitationstyleConversion\\Model\\Converter\\Pandoc');

    // Prefer the NLMXML document produced by the references replacement
    // step; fall back to the plain NLMXML stage document if there is none.
    $nlmxmlDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEXREFERENCES)
        ?: $job->getStageDocument(JOB_CONVERSION_STAGE_NLMXML);
    if (!$nlmxmlDocument) {
        throw new \Exception('Couldn\'t find the NLMXML stage document');
    }

    $bibtexDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEX);
    if (!$bibtexDocument) {
        throw new \Exception('Couldn\'t find the Bibtex stage document');
    }

    $htmlDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_HTML);
    if (!$htmlDocument) {
        throw new \Exception('Couldn\'t find the HTML stage document');
    }

    // Configure the converter with all inputs and run it.
    $converter->setInputFileNlmxml($nlmxmlDocument->path);
    $converter->setInputFileBibtex($bibtexDocument->path);
    $converter->setInputFileHtml($htmlDocument->path);
    $converter->setCitationStyleFile($job->citationStyleFile);
    $converter->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_CITATIONSTYLE;

    if ($converter->getStatus()) {
        // Move the HTML document along to the job's current stage.
        $htmlDocument->conversionStage = $job->conversionStage;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Generate an EPUB file from the job's NLMXML document.
 *
 * The input is the first available document among the XML merge stage, the
 * references replacement stage and the PDF extraction stage, in that order.
 * The job is moved to JOB_CONVERSION_STAGE_EPUB once the Epub converter
 * service has run; the output document is only registered on success.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if none of the candidate stage documents exists
 */
public function process(Job $job)
{
    $converter = $this->sm->get('EpubConversion\\Model\\Converter\\Epub');

    // Later candidates are only looked up when the earlier ones are missing.
    $sourceDocument = ($job->getStageDocument(JOB_CONVERSION_STAGE_XML_MERGE)
        ?: $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEXREFERENCES))
        ?: $job->getStageDocument(JOB_CONVERSION_STAGE_PDF_EXTRACT);
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Run the conversion into document.epub inside the job's document path.
    $epubPath = $job->getDocumentPath() . '/document.epub';
    $converter->setInputFile($sourceDocument->path);
    $converter->setOutputFile($epubPath);
    $converter->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_EPUB;

    if ($converter->getStatus()) {
        // Register the EPUB file as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $epubDocument = $dao->getInstance();
        $epubDocument->path = $epubPath;
        $epubDocument->job = $job;
        $epubDocument->conversionStage = JOB_CONVERSION_STAGE_EPUB;
        $job->documents[] = $epubDocument;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Classify the job's initial input document.
 *
 * Detects the MIME type of the JOB_CONVERSION_STAGE_UNCONVERTED document
 * with fileinfo. A detected type of 'application/pdf' marks the job as PDF
 * input at the PDF_IN stage; every other result (including a failed
 * detection) marks it as WP input at the WP_IN stage. The input document
 * itself is moved to the same stage.
 *
 * @param Job $job
 *
 * @return Job $job The same job instance with input format and stage set
 *
 * @throws \Exception if the initial input document cannot be found
 */
public function process(Job $job)
{
    $inputDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_UNCONVERTED);
    if (!$inputDocument) {
        throw new \Exception("Couldn’t find the initial input document");
    }

    // Ask fileinfo for the bare MIME type of the uploaded file.
    $fileInfo = finfo_open(FILEINFO_MIME_TYPE);
    $detectedType = finfo_file($fileInfo, $inputDocument->path);
    $isPdf = ($detectedType == 'application/pdf');

    // Anything that is not recognised as PDF is treated as WP input.
    $job->inputFileFormat = $isPdf ? JOB_INPUT_TYPE_PDF : JOB_INPUT_TYPE_WP;

    $inputStage = $isPdf ? JOB_CONVERSION_STAGE_PDF_IN : JOB_CONVERSION_STAGE_WP_IN;
    $job->conversionStage = $inputStage;
    $inputDocument->conversionStage = $inputStage;

    return $job;
}
/**
 * Convert the DOCX stage document to NLMXML.
 *
 * Runs the Metypeset converter service on the job's
 * JOB_CONVERSION_STAGE_DOCX document, writing into a "metypeset"
 * sub-directory of the job's document path. The XML file expected under
 * "nlm/" in that directory is then copied to document_metypeset.xml and
 * registered with the job as its JOB_CONVERSION_STAGE_NLMXML document.
 *
 * The job is marked as failed, and nothing is registered, when the
 * converter reports a failure, the expected XML file is missing, or the
 * file cannot be copied to its final location.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED on
 *                  any of the failures described above.
 * @throws \Exception if the job has no JOB_CONVERSION_STAGE_DOCX document
 */
public function process(Job $job)
{
    $metypeset = $this->sm->get('NlmxmlConversion\\Model\\Converter\\Metypeset');

    // Fetch the document to convert
    $docxDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_DOCX);
    if (!$docxDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Convert the document
    $metypeset->setInputFile($docxDocument->path);
    $outputDirectory = $job->getDocumentPath() . '/metypeset';
    $metypeset->setOutputDirectory($outputDirectory);
    $metypeset->convert();

    // The converter output is looked up by the input's file name
    // (getFileName(true) presumably omits the extension; confirm against
    // the document model).
    $xmlFile = $docxDocument->getFileName(true) . '.xml';
    $meTypesetOutputPath = $outputDirectory . '/nlm/' . $xmlFile;
    if (!$metypeset->getStatus() || !file_exists($meTypesetOutputPath)) {
        $job->status = JOB_STATUS_FAILED;
        return $job;
    }

    // Copy the result next to the job's other documents. Warnings stay
    // suppressed because the return value is checked: without the copy
    // there is no file to register, so the step has failed.
    $outputPath = $job->getDocumentPath() . '/document_metypeset.xml';
    if (!@copy($meTypesetOutputPath, $outputPath)) {
        $job->status = JOB_STATUS_FAILED;
        return $job;
    }

    $documentDAO = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
    $nlmxmlDocument = $documentDAO->getInstance();
    $nlmxmlDocument->path = $outputPath;
    $nlmxmlDocument->job = $job;
    $nlmxmlDocument->conversionStage = JOB_CONVERSION_STAGE_NLMXML;

    $job->documents[] = $nlmxmlDocument;
    $job->conversionStage = JOB_CONVERSION_STAGE_NLMXML;

    return $job;
}
/**
 * Extract XML content from the job's PDF document.
 *
 * Runs the Cermine converter service on the PDF_IN document (PDF input) or
 * the WP_PDF document (any other input). On success the resulting XML file
 * is registered with the job as its JOB_CONVERSION_STAGE_PDF_EXTRACT
 * document and the job is advanced to that stage.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the required PDF stage document is missing
 */
public function process(Job $job)
{
    $extractor = $this->sm->get('Cermine\\Model\\Converter\\Cermine');

    // Which stage holds the PDF depends on the job's input format.
    $sourceStage = ($job->inputFileFormat == JOB_INPUT_TYPE_PDF)
        ? JOB_CONVERSION_STAGE_PDF_IN
        : JOB_CONVERSION_STAGE_WP_PDF;
    $sourceDocument = $job->getStageDocument($sourceStage);
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Run the extraction into an XML file inside the job's document path.
    $extractor->setInputFile($sourceDocument->path);
    $targetPath = $job->getDocumentPath() . '/document_from_pdf.xml';
    $extractor->setOutputFile($targetPath);
    $extractor->convert();

    if ($extractor->getStatus()) {
        // Register the extracted XML as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $extracted = $dao->getInstance();
        $extracted->path = $targetPath;
        $extracted->mimeType = 'application/xml';
        $extracted->job = $job;
        $extracted->conversionStage = JOB_CONVERSION_STAGE_PDF_EXTRACT;

        $job->documents[] = $extracted;
        $job->conversionStage = JOB_CONVERSION_STAGE_PDF_EXTRACT;

        return $job;
    }

    // Extraction failed: flag the job, register nothing.
    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Run the XMP converter on the job's PDF document.
 *
 * Hands the NLMXML and PDF stage documents to the Xmp converter service.
 * No new document is created: on success the existing PDF document is
 * re-labelled as the JOB_CONVERSION_STAGE_XMP document. The job itself is
 * moved to that stage once the converter has run, whatever the outcome.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the NLMXML or PDF stage document is missing
 */
public function process(Job $job)
{
    $xmpConverter = $this->sm->get('XmpConversion\\Model\\Converter\\Xmp');

    // Prefer the NLMXML document produced by the references replacement
    // step; fall back to the plain NLMXML stage document if there is none.
    $nlmxmlDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEXREFERENCES)
        ?: $job->getStageDocument(JOB_CONVERSION_STAGE_NLMXML);
    if (!$nlmxmlDocument) {
        throw new \Exception('Couldn\'t find the NLMXML stage document');
    }

    $pdfDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_PDF);
    if (!$pdfDocument) {
        throw new \Exception('Couldn\'t find the PDF stage document');
    }

    $xmpConverter->setInputFileNlmxml($nlmxmlDocument->path);
    $xmpConverter->setInputFilePdf($pdfDocument->path);
    $xmpConverter->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_XMP;

    if ($xmpConverter->getStatus()) {
        // Nothing new to register; the PDF document simply changes stage.
        $pdfDocument->conversionStage = JOB_CONVERSION_STAGE_XMP;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Generate a PDF file from the job's HTML document.
 *
 * Uses the citation-style stage document when there is one and the plain
 * HTML stage document otherwise. The job is moved to
 * JOB_CONVERSION_STAGE_PDF once the Pdf converter service has run; the
 * output document is only registered on success.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if neither candidate stage document exists
 */
public function process(Job $job)
{
    $converter = $this->sm->get('PdfConversion\\Model\\Converter\\Pdf');

    // First choice: the document with converted citations. The unconverted
    // HTML document is only looked up when the first choice is missing.
    $sourceDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_CITATIONSTYLE);
    if (!$sourceDocument) {
        $sourceDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_HTML);
    }
    if (!$sourceDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // Run the conversion into document.pdf inside the job's document path.
    $pdfPath = $job->getDocumentPath() . '/document.pdf';
    $converter->setInputFile($sourceDocument->path);
    $converter->setOutputFile($pdfPath);
    $converter->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_PDF;

    if ($converter->getStatus()) {
        // Register the PDF file as a new document of the job.
        $dao = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
        $generated = $dao->getInstance();
        $generated->path = $pdfPath;
        $generated->job = $job;
        $generated->conversionStage = JOB_CONVERSION_STAGE_PDF;
        $job->documents[] = $generated;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Replace the reference list of the job's NLMXML document.
 *
 * Hands the XML document (PDF-extract stage for PDF input, NLMXML stage
 * otherwise) and the BibTeX stage document to the Bibtexreferences
 * converter service. No new document is created: on success the XML
 * document is re-labelled as the JOB_CONVERSION_STAGE_BIBTEXREFERENCES
 * document. The job itself is moved to that stage once the converter has
 * run, whatever the outcome.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the converter reports a failure.
 * @throws \Exception if the XML or the BibTeX stage document is missing
 */
public function process(Job $job)
{
    $replacer = $this->sm->get('BibtexreferencesConversion\\Model\\Converter\\Bibtexreferences');

    // The document whose references get replaced depends on the input format.
    $targetStage = ($job->inputFileFormat == JOB_INPUT_TYPE_PDF)
        ? JOB_CONVERSION_STAGE_PDF_EXTRACT
        : JOB_CONVERSION_STAGE_NLMXML;
    $targetDocument = $job->getStageDocument($targetStage);
    if (!$targetDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    // The BibTeX document supplies the replacement references.
    $bibDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEX);
    if (!$bibDocument) {
        throw new \Exception('Couldn\'t find the stage document');
    }

    $replacer->setInputFileNlmxml($targetDocument->path);
    $replacer->setInputFileBibtex($bibDocument->path);
    $replacer->convert();

    // The stage is recorded before the outcome is inspected.
    $job->conversionStage = JOB_CONVERSION_STAGE_BIBTEXREFERENCES;

    if ($replacer->getStatus()) {
        // Nothing new to register; the XML document simply changes stage.
        $targetDocument->conversionStage = JOB_CONVERSION_STAGE_BIBTEXREFERENCES;

        return $job;
    }

    $job->status = JOB_STATUS_FAILED;

    return $job;
}
/**
 * Merge the XML outputs to one file.
 *
 * For PDF input the CERMINE document (references replacement stage if
 * available, PDF extraction stage otherwise) is copied to document.xml
 * as-is. For any other input the meTypeset document (references replacement
 * stage if available, NLMXML stage otherwise) and the CERMINE document are
 * passed to the Merge converter service, which writes document.xml.
 *
 * The job is moved to JOB_CONVERSION_STAGE_XML_MERGE before the output is
 * produced. The output is registered as the document of that stage only
 * when the copy or merge succeeded; otherwise the job is marked as failed.
 *
 * @param Job $job
 * @return Job $job The same job instance; status is JOB_STATUS_FAILED when
 *                  the copy fails or the converter reports a failure.
 * @throws \Exception if the CERMINE output, or for non-PDF input the
 *                    meTypeset output, cannot be found
 */
public function process(Job $job)
{
    $mergedXML = $this->sm->get('MergeXMLOutputs\\Model\\Converter\\Merge');

    $isPdfInput = ($job->inputFileFormat == JOB_INPUT_TYPE_PDF);

    // Only for PDF input can the references replacement stage hold the
    // CERMINE document; otherwise use the PDF extraction stage directly.
    $cermineDocument = null;
    if ($isPdfInput) {
        $cermineDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEXREFERENCES);
    }
    if (!$cermineDocument) {
        $cermineDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_PDF_EXTRACT);
    }
    if (!$cermineDocument) {
        throw new \Exception('Couldn\'t find the CERMINE output');
    }

    $outputFile = $job->getDocumentPath() . '/document.xml';
    $job->conversionStage = JOB_CONVERSION_STAGE_XML_MERGE;

    if ($isPdfInput) {
        // Nothing to merge with: the CERMINE output is the result. Warnings
        // stay suppressed because the return value is checked; a failed
        // copy means there is no output file to register.
        if (!@copy($cermineDocument->path, $outputFile)) {
            $job->status = JOB_STATUS_FAILED;
            return $job;
        }
    } else {
        // Prefer the meTypeset document with replaced references.
        $meTypesetDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_BIBTEXREFERENCES);
        if (!$meTypesetDocument) {
            $meTypesetDocument = $job->getStageDocument(JOB_CONVERSION_STAGE_NLMXML);
        }
        if (!$meTypesetDocument) {
            throw new \Exception('Couldn\'t find the meTypeset output');
        }

        $mergedXML->setInputFileNlmxml($meTypesetDocument->path);
        $mergedXML->setInputFileCermine($cermineDocument->path);
        $mergedXML->setOutputFile($outputFile);
        $mergedXML->convert();

        if (!$mergedXML->getStatus()) {
            $job->status = JOB_STATUS_FAILED;
            return $job;
        }
    }

    // Register the merged file as a new document of the job.
    $documentDAO = $this->sm->get('Manager\\Model\\DAO\\DocumentDAO');
    $mergedXMLDocument = $documentDAO->getInstance();
    $mergedXMLDocument->path = $outputFile;
    $mergedXMLDocument->job = $job;
    $mergedXMLDocument->conversionStage = JOB_CONVERSION_STAGE_XML_MERGE;
    $job->documents[] = $mergedXMLDocument;

    return $job;
}