/**
 * Primary processing routine for the source.
 *
 * For every source file: converts it to HTML pages, collects each page's body
 * HTML and the converter's page CSS, and saves image assets. Afterwards every
 * collected page is written out (with an empty per-page stylesheet), the
 * combined book stylesheet is saved, the cover image is set, source assets are
 * imported and the page count is updated.
 *
 * @return bool Always true once processing has finished.
 */
private function processBook()
{
    // Collectors shared across all source files
    $pageContents = array();
    $pageCssContents = array();

    // Work through all source files
    foreach ($this->sourceFiles as $sourceFile) {
        // Init Apache POI. `new` either yields an object (always truthy) or
        // throws, so no falsy check on the result is needed.
        $converter = new XWPFToHTMLConverter($this->workingDir, $this->progress);

        // Set docx file to parse and convert everything to HTML
        $converter->setDocFileToParse($sourceFile);
        $converter->convertToHTML();

        // Get HTML pages
        $pages = $converter->getHTMLPages();
        $this->progress->adjustMaxSteps(count($pages) * 2 + count($this->sourceAssets) + 1);

        foreach ($pages as $page) {
            $page->setStyleInline(false);
            $pageContents[] = $page->getBodyHTML();
            $this->progress->incrementStep();
        }

        // Get CSS
        $pageCssContents[] = $converter->mainStyleSheet->getPagesCSS();

        // Save image assets
        $this->createImageAssets();
    }

    // Save page HTML; page numbers are 1-based
    foreach ($pageContents as $index => $contents) {
        $pageNumber = $index + 1;
        $this->savePageHTML($contents, $pageNumber);
        $this->savePageCSS('', $pageNumber);
        $this->progress->incrementStep();
    }

    // Save book CSS
    $this->saveBookCSS("body {margin: 0px; padding: 0px;} \n\n" . implode("\n", $pageCssContents));
    $this->progress->incrementStep();

    $this->setCoverImage();

    // Count the pages gathered from ALL source files. Counting the loop-local
    // $pages would only reflect the last file and would be undefined when
    // there are no source files at all.
    $numPages = count($pageContents);

    $this->importSourceAssets();
    $this->updatePageCount($numPages);

    return true;
}
/**
 * Builds the HTML element for the current run.
 *
 * If the run embeds pictures, the processed container of the first picture is
 * returned immediately. Otherwise a text container is produced: the element
 * from selectSubscriptContainer() when getSubscript() is anything other than 1,
 * a plain span when it is 1. A run consisting of a single carriage return or
 * BEL mark gets a trailing line break appended to its text.
 *
 * @return HTMLElement
 */
public function parseRun()
{
    // Pictures take precedence over text: return on the first one found
    $embeddedPictures = java_values($this->run->getEmbeddedPictures());
    if (count($embeddedPictures) > 0) {
        foreach ($embeddedPictures as $embeddedPicture) {
            $directoryPath = XWPFToHTMLConverter::getDirectoryPath();
            $pictureParser = new XWPFPicture($embeddedPicture, $this->mainStyleSheet, $directoryPath);

            return $pictureParser->processPicture();
        }
    }

    // Choose the element that will hold the run's characters
    if ($this->getSubscript() != 1) {
        $element = $this->selectSubscriptContainer();
    } else {
        $element = new HTMLElement(HTMLElement::SPAN);
    }

    $content = $this->getText();

    // A lone "\r" or BEL mark stands for a line break
    if (strlen($content) == 1) {
        $onlyChar = substr($content, -1, 1);
        if ($onlyChar == "\r" || ord($onlyChar) == HWPFWrapper::BEL_MARK) {
            $content .= '<br />';
        }
    }

    $cssClass = $this->processRunStyle($this->run);

    $element->setInnerText($content);
    $element->setClass($cssClass);

    return $element;
}
<form> <input type="text" id="file" name="file" /> <input type="submit" title="process" /> </form> </div> <?php include_once 'Word/XWPFToHTMLConverter.php'; //Set document directory $progress = ""; $workingDir = "/home/peter/Documents"; $sourceFile = "/home/peter/Documents/Strikethrough.docx"; //Initiate time counter $start = microtime(true); // Init Apache POI $converter = new XWPFToHTMLConverter($workingDir, $progress); if (!$converter) { throw new Exception('[WordProcessor::routine] ' . 'Book ID ' . $this->bookId . ' cannot be processed as a working directory cannot be found.'); } // Set docx file to parse $converter->setDocFileToParse($sourceFile); // Convert everything to HTML $converter->convertToHTML(); $hasToc = $converter->hasTOC(); if ($hasToc) { $tocNumbering = $converter->getTocNumbering(); $TOC = $converter->getTableOfContents(); // foreach($TOC as $entry){ // if(strlen($entry['num']) == 0){ // var_dump($entry); // }
/**
 * Parses a characterRun element of HWPF document.
 *
 * If the run embeds pictures, the processed container of the first picture is
 * returned. Otherwise the run becomes an anchor (when its run style value is
 * 'InternetLink') or a span, with its text escaped for XHTML and optionally
 * wrapped in the bold/italic containers supplied by processCharacterRunStyle().
 *
 * @param object $characterRun Character run
 * @return HTMLElement
 */
private function parseCharacterRun($characterRun)
{
    // Even non text elements will have a text property. The text is kept raw
    // here; escaping happens first and line-break markup is added afterwards,
    // so that the generated <br /> tags are not themselves escaped.
    $rawText = java_values($characterRun->getText(0));

    // Get and process pictures if there are any (the first picture wins)
    $pictures = java_values($characterRun->getEmbeddedPictures());
    if (count($pictures) > 0) {
        foreach ($pictures as $picture) {
            $path = XWPFToHTMLConverter::getDirectoryPath();
            $pictureContainer = new XWPFPicture($picture, $this->mainStyleSheet, $path);

            return $pictureContainer->processPicture();
        }
    }

    // Get run XML
    $charXml = java_values($characterRun->getCTR()->ToString());

    // Check for section numbering
    $this->checkSectionNumbering($characterRun, $charXml);

    // Drop the colon of every "w:" so the XPath below can address the elements
    // as plain names (wrPr, wrStyle, wval). Note that this rewrites every
    // occurrence of "w:" in the XML string, including inside text content.
    $charXml = str_replace('w:', 'w', $charXml);
    $xml = new SimpleXMLElement($charXml);

    // Get the run style value used to recognise hyperlinks
    $link = $xml->xpath("wrPr/wrStyle");
    if (!empty($link)) {
        $linkValue = $link[0]['wval'];
    } else {
        $linkValue = "none";
    }

    // Loose comparison on purpose: $linkValue may be a SimpleXMLElement
    if ($linkValue == 'InternetLink') {
        // Anchor whose target is the run's own text
        $container = new HTMLElement(HTMLElement::A);
        $container->setAttribute('href', $rawText);
    } else {
        // In every other case we do simple text parsing
        $container = new HTMLElement(HTMLElement::SPAN);
    }

    $styleClass = $this->processCharacterRunStyle($characterRun, $xml);

    // NOTE(review): looks redundant if getTagName() returns 'a' for anchors,
    // since the non-link branch above already created a span; kept as is to
    // preserve behaviour.
    if ($container->getTagName() != 'a') {
        $container = new HTMLElement(HTMLElement::SPAN);
    }

    // Check for new line: a run that is exactly one "\r" or BEL mark. This
    // must be tested on the raw text, before any markup is added to it.
    $addNewLine = false;
    if (strlen($rawText) == 1) {
        $onlyChar = substr($rawText, -1, 1);
        if ($onlyChar == "\r" || ord($onlyChar) == HWPFWrapper::BEL_MARK) {
            $addNewLine = true;
        }
    }

    // Escape text for XHTML
    $text = XhtmlEntityConverter::convertToNumericalEntities(htmlentities($rawText, ENT_COMPAT | ENT_XHTML));

    if ($addNewLine) {
        // Lone break marker: emit exactly one line break
        $text .= '<br />';
    } else {
        // Turn embedded newlines into line breaks only after escaping
        $text = nl2br($text);
    }

    $boldContainer = $styleClass['containers']['bold'];
    $italicContainer = $styleClass['containers']['italic'];

    //TODO check why this fails with large documents
    //if($container->getTagName() == '') $container->setInnerElement($text);

    if ($boldContainer && $italicContainer) {
        // Set bold and italic semantic tags: container > bold > italic > text
        $container->addInnerElement($boldContainer);
        $boldContainer->addInnerElement($italicContainer);
        $italicContainer->setInnerText($text);
    } elseif ($boldContainer) {
        // Set bold tag
        $container->addInnerElement($boldContainer);
        $boldContainer->setInnerText($text);
    } elseif ($italicContainer) {
        // Set italic tag
        $container->addInnerElement($italicContainer);
        $italicContainer->setInnerText($text);
    } else {
        // Set inner text directly on the container
        $container->setInnerText($text);
    }

    // Get and set class name on container
    $container->setClass($styleClass['style'] . ' textframe cke_focus');

    // Return container
    return $container;
}