/**
 * Extract HTML from XHTML container.
 *
 * The element is copied into a fresh UTF-8 document and serialised; the XML
 * declaration and the wrapping XHTML <div> (opening and closing tag) are then
 * stripped. When the XHTML namespace is bound to a prefix, that prefix is
 * also removed from the remaining element names.
 *
 * @param   \DOMElement  $element  The XHTML <div> container element.
 *
 * @return  string  The serialised markup found inside the container.
 *
 * @since   1.0
 */
public static function extract(\DOMElement $element)
{
    $doc      = new \DOMDocument('1.0', 'utf-8');
    $imported = $doc->importNode($element, true);
    $doc->appendChild($imported);

    // lookupPrefix() yields null (not '') when the namespace is the default
    // one, so both values have to be treated as "no prefix".
    $prefix    = $doc->lookupPrefix('http://www.w3.org/1999/xhtml');
    $hasPrefix = null !== $prefix && '' !== $prefix;
    $tagPrefix = $hasPrefix ? preg_quote($prefix, '/') . ':' : '';

    $patterns = [
        // XML declaration plus the opening container tag. The tag is matched
        // up to its own '>' so text directly following it is preserved; a
        // self-closed container is consumed together with trailing whitespace.
        '/<\\?xml[^<]*>[^<]*<' . $tagPrefix . 'div[^>]*(?:\\/>\\s*$|>)/',
        // Closing container tag at the very end of the serialisation.
        '/<\\/' . $tagPrefix . 'div>\\s*$/',
    ];

    $text = preg_replace($patterns, '', $doc->saveXML());

    if ($hasPrefix) {
        // Drop the namespace prefix from the remaining start and end tags.
        $text = preg_replace('/(<[\\/]?)' . $tagPrefix . '([a-zA-Z]+)/', '$1$2', $text);
    }

    return $text;
}
/**
 * Get the entry content
 *
 * The value is cached in $this->_data['content'] after the first call.
 * Text and HTML content types return the node value of atom:content; the
 * "xhtml" type serialises the contained xhtml:div and passes it through
 * _collectXhtml(). When no content is found the entry description is used.
 *
 * @return string
 */
public function getContent()
{
    if (array_key_exists('content', $this->_data)) {
        return $this->_data['content'];
    }

    $content = null;

    $el = $this->getXpath()->query($this->getXpathPrefix() . '/atom:content');
    if ($el->length > 0) {
        $el   = $el->item(0);
        $type = $el->getAttribute('type');
        switch ($type) {
            case '':
            case 'text':
            case 'text/plain':
            case 'html':
            case 'text/html':
                $content = $el->nodeValue;
                break;
            case 'xhtml':
                $this->getXpath()->registerNamespace('xhtml', 'http://www.w3.org/1999/xhtml');
                $xhtml = $this->getXpath()->query(
                    $this->getXpathPrefix() . '/atom:content/xhtml:div'
                )->item(0);
                if (null === $xhtml) {
                    // No xhtml:div wrapper present: nothing to import, so
                    // leave $content empty and fall back to the description.
                    break;
                }
                $d      = new DOMDocument('1.0', $this->getEncoding());
                $xhtmls = $d->importNode($xhtml, true);
                $d->appendChild($xhtmls);
                $content = $this->_collectXhtml(
                    $d->saveXML(),
                    $d->lookupPrefix('http://www.w3.org/1999/xhtml')
                );
                break;
        }
    }

    if (!$content) {
        $content = $this->getDescription();
    }

    // Cast guards against trim() receiving null when no description exists.
    $this->_data['content'] = trim((string) $content);

    return $this->_data['content'];
}
/**
 * Process external references from a HTML to the book. The chapter itself is not stored.
 * The HTML is scanned for <link..., <style..., and <img tags.
 * Embedded CSS styles and links will also be processed.
 * Script tags are not processed, as scripting should be avoided in e-books.
 *
 * EPub keeps track of added files, and duplicate files referenced across multiple
 * chapters are only added once.
 *
 * If $doc is a string, it is assumed to be the content of an HTML file and is
 * replaced (by reference) with the re-serialised XHTML document; otherwise it
 * is assumed to be a DOMDocument and is processed in place.
 *
 * Basedir is the root dir the HTML is supposed to "live" in, used to resolve
 * relative references such as <code><img src="../images/image.png"/></code>
 *
 * $externalReferences determines how the function will handle external references.
 *
 * @param mixed  $doc                (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param String $baseDir            Default is "", meaning it is pointing to the document root.
 * @param String $htmlDir            The path to the parent HTML file's directory from the root of the archive.
 *
 * @return Bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterExternalReferences(&$doc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // Turn every directory segment of $htmlDir into "../" to get the path
    // leading back from the HTML file's directory.
    $backPath = preg_replace('#[^/]+/#i', "../", $htmlDir);
    $isDocAString = is_string($doc);

    if ($isDocAString) {
        $xmlDoc = new DOMDocument();
        // Parser warnings are suppressed here, as in the original code.
        @$xmlDoc->loadHTML($doc);
    } else {
        $xmlDoc = $doc;
    }

    $this->processChapterStyles($xmlDoc, $externalReferences, $baseDir, $htmlDir);
    $this->processChapterLinks($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterImages($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);

    if ($isDocAString) {
        // Rebuild the document inside an XHTML 1.1 skeleton.
        $xml2Doc = new DOMDocument('1.0', "utf-8");
        $xml2Doc->loadXML("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n\t\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n</html>\n");
        $html = $xml2Doc->getElementsByTagName("html")->item(0);

        // Copy <head> and <body> when present; a parsed fragment may lack
        // either one, in which case item(0) is NULL and must not be imported.
        foreach (array("head", "body") as $sectionName) {
            $section = $xmlDoc->getElementsByTagName($sectionName)->item(0);
            if ($section !== NULL) {
                $html->appendChild($xml2Doc->importNode($section, TRUE));
            }
        }

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml = new DOMDocument('1.0', "utf-8");
        $xml->preserveWhiteSpace = FALSE;
        $xml->formatOutput = TRUE;
        $xml->loadXML($xml2Doc->saveXML());
        $doc = $xml->saveXML();
    }

    return TRUE;
}
/**
 * Split $chapter into multiple parts.
 *
 * The search string can either be a regular string or a PCRE regular expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
 * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
 *
 * Without a search string, a chapter no longer than $this->splitDefaultSize
 * is returned unchanged as a single-element array.
 *
 * @param String $chapter             XHTML file
 * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
 * @param String $searchString        Chapter string to search for can be fixed text, or a regular expression pattern.
 *
 * @return array with 1 or more parts; keyed by the matched node text when
 *               splitting on a search string, by numeric index otherwise.
 */
function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
    $chapterData = array();

    // A search string that looks like "<delim>...<delim><modifiers>" is used
    // as a pattern as-is; anything else is quoted and anchored behind a tag.
    $isSearchRegexp = $splitOnSearchString && preg_match('#^(\\D|\\S|\\W).+\\1[imsxeADSUXJu]*$#m', $searchString) == 1;
    if ($splitOnSearchString && !$isSearchRegexp) {
        $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
    }

    if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
        return array($chapter);
    }

    $xmlDoc = new DOMDocument();
    @$xmlDoc->loadHTML($chapter);

    $head = $xmlDoc->getElementsByTagName("head");
    $body = $xmlDoc->getElementsByTagName("body");

    // Everything up to and including the opening <html ...> tag is reused
    // as the skeleton of every generated part.
    $htmlPos = stripos($chapter, "<html");
    $htmlEndPos = stripos($chapter, ">", $htmlPos);
    $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
    $headerLength = strlen($newXML);

    $files = array();
    $chapterNames = array();
    $domDepth = 0;
    $domPath = array();
    $domClonedPath = array();

    $curFile = $xmlDoc->createDocumentFragment();
    $files[] = $curFile;
    $curParent = $curFile;
    $curSize = 0;

    $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
    $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;

    $partSize = $this->splitDefaultSize - $headLen;

    if ($bodyLen > $partSize) {
        $parts = ceil($bodyLen / $partSize);
        $partSize = $bodyLen / $parts - $headLen;
    }

    $node = $body->item(0)->firstChild;

    do {
        $nodeData = $xmlDoc->saveXML($node);
        $nodeLen = strlen($nodeData);

        if ($nodeLen > $partSize && $node->hasChildNodes()) {
            // Node is too large for one part: descend into it and remember
            // the ancestor so it can be re-created in each new part.
            // NOTE(review): $nodeData/$nodeLen are not recomputed after the
            // descent, so they still describe the parent node below.
            $domPath[] = $node;
            $domClonedPath[] = $node->cloneNode(false);
            $domDepth++;

            $node = $node->firstChild;
        }

        $node2 = $node->nextSibling;

        if ($node != null && $node->nodeName != "#text") {
            $doSplit = false;
            if ($splitOnSearchString) {
                $doSplit = preg_match($searchString, $nodeData) == 1;
                if ($doSplit) {
                    $chapterNames[] = trim($nodeData);
                }
            }

            if ($curSize > 0 && ($doSplit || !$splitOnSearchString && $curSize + $nodeLen > $partSize)) {
                $curFile = $xmlDoc->createDocumentFragment();
                $files[] = $curFile;
                $curParent = $curFile;
                if ($domDepth > 0) {
                    // Re-create the chain of open ancestors in the new part.
                    // (foreach replaces each(), which was removed in PHP 8.0.)
                    foreach ($domClonedPath as $clonedAncestor) {
                        $newParent = $clonedAncestor->cloneNode(false);
                        $curParent->appendChild($newParent);
                        $curParent = $newParent;
                    }
                }
                $curSize = strlen($xmlDoc->saveXML($curFile));
            }
            $curParent->appendChild($node->cloneNode(true));
            $curSize += $nodeLen;
        }

        $node = $node2;
        // Ran out of siblings: climb back up until a next sibling exists.
        while ($node == null && $domDepth > 0) {
            $domDepth--;
            $node = end($domPath)->nextSibling;
            array_pop($domPath);
            array_pop($domClonedPath);
            $curParent = $curParent->parentNode;
        }
    } while ($node != null);

    $xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
    $xml->preserveWhiteSpace = false;
    $xml->formatOutput = true;

    for ($idx = 0; $idx < count($files); $idx++) {
        $xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
        $xml2Doc->loadXML($newXML);
        $html = $xml2Doc->getElementsByTagName("html")->item(0);
        $html->appendChild($xml2Doc->importNode($head->item(0), true));
        $body = $xml2Doc->createElement("body");
        $html->appendChild($body);
        $body->appendChild($xml2Doc->importNode($files[$idx], true));

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml->loadXML($xml2Doc->saveXML());

        // NOTE(review): $chapterNames only gains an entry when a node matches,
        // so it can hold fewer entries than $files (e.g. content before the
        // first match) — confirm the intended keying.
        $chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $xml->saveXML();
    }

    return $chapterData;
}
/**
 * Split $chapter into multiple parts.
 *
 * A chapter no longer than $this->splitDefaultSize is returned unchanged as
 * a single-element array. Larger chapters are parsed as HTML and their body
 * content is distributed over several documents, each of which repeats the
 * original <html ...> opening tag and <head>.
 *
 * @param $chapter XHTML file content
 * @return array with 1 or more parts
 */
function splitChapter($chapter) {
    $chapterData = array();

    // The limit is an object property; the bare local variable used here
    // before was undefined, so this shortcut never applied.
    if (strlen($chapter) <= $this->splitDefaultSize) {
        $chapterData[] = $chapter;
        return $chapterData;
    }

    $xmlDoc = new DOMDocument();
    $xmlDoc->loadHTML($chapter);

    $head = $xmlDoc->getElementsByTagName("head");
    $body = $xmlDoc->getElementsByTagName("body");

    // Everything up to and including the opening <html ...> tag is reused
    // as the skeleton of every generated part.
    $htmlPos = stripos($chapter, "<html");
    $htmlEndPos = stripos($chapter, ">", $htmlPos);
    $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
    $headerLength = strlen($newXML);

    $files = array();
    $domDepth = 0;
    $domPath = array();
    $domClonedPath = array();

    $curFile = $xmlDoc->createDocumentFragment();
    $files[] = $curFile;
    $curParent = $curFile;
    $curSize = 0;

    $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
    $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;

    $partSize = $this->splitDefaultSize - $headLen;

    if ($bodyLen > $partSize) {
        $parts = ceil($bodyLen / $partSize);
        $partSize = $bodyLen / $parts - $headLen;
    }

    $node = $body->item(0)->firstChild;

    do {
        $nodeData = $xmlDoc->saveXML($node);
        $nodeLen = strlen($nodeData);

        if ($nodeLen > $partSize && $node->hasChildNodes()) {
            // Node is too large for one part: descend into it and remember
            // the ancestor so it can be re-created in each new part.
            // NOTE(review): $nodeLen is not recomputed after the descent, so
            // it still describes the parent node below.
            $domPath[] = $node;
            $domClonedPath[] = $node->cloneNode(false);
            $domDepth++;

            $node = $node->firstChild;
        }

        $node2 = $node->nextSibling;

        if ($node != null && $node->nodeName != "#text") {
            if ($curSize > 0 && $curSize + $nodeLen > $partSize) {
                $curFile = $xmlDoc->createDocumentFragment();
                $files[] = $curFile;
                $curParent = $curFile;
                if ($domDepth > 0) {
                    // Re-create the chain of open ancestors in the new part.
                    // (foreach replaces each(), which was removed in PHP 8.0.)
                    foreach ($domClonedPath as $clonedAncestor) {
                        $newParent = $clonedAncestor->cloneNode(false);
                        $curParent->appendChild($newParent);
                        $curParent = $newParent;
                    }
                }
                $curSize = strlen($xmlDoc->saveXML($curFile));
            }
            $curParent->appendChild($node->cloneNode(true));
            $curSize += $nodeLen;
        }

        $node = $node2;
        // Ran out of siblings: climb back up until a next sibling exists.
        while ($node == null && $domDepth > 0) {
            $domDepth--;
            $node = end($domPath)->nextSibling;
            array_pop($domPath);
            array_pop($domClonedPath);
            $curParent = $curParent->parentNode;
        }
    } while ($node != null);

    $xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
    $xml->preserveWhiteSpace = false;
    $xml->formatOutput = true;

    for ($idx = 0; $idx < count($files); $idx++) {
        $xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
        $xml2Doc->loadXML($newXML);
        $html = $xml2Doc->getElementsByTagName("html")->item(0);
        $html->appendChild($xml2Doc->importNode($head->item(0), true));
        $body = $xml2Doc->createElement("body");
        $html->appendChild($body);
        $body->appendChild($xml2Doc->importNode($files[$idx], true));

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml->loadXML($xml2Doc->saveXML());
        $chapterData[] = $xml->saveXML();
    }

    return $chapterData;
}
<?php
// Checks that namespace lookups on a DOMDocument give the same answers as
// the same lookups on its document element, using the nsdoc.xml fixture
// located next to this script. Each var_dump prints one comparison result.

$doc = new DOMDocument();
$doc->load(dirname(__FILE__) . "/nsdoc.xml");

$root = $doc->documentElement;

// Namespace URI bound to the "ns2" prefix.
$duri = $doc->lookupNamespaceURI("ns2") . "\n";
$euri = $root->lookupNamespaceURI("ns2") . "\n";

var_dump($duri == $euri);

// Prefix bound to the "http://ns2" namespace URI.
$dpref = $doc->lookupPrefix("http://ns2") . "\n";
$epref = $root->lookupPrefix("http://ns2") . "\n";

var_dump($dpref == $epref);

// Whether "http://ns" is the default namespace.
$disdef = $doc->isDefaultNamespace("http://ns") . "\n";
$eisdef = $root->isDefaultNamespace("http://ns") . "\n";

// Compare the isDefaultNamespace() results; the prefix pair was compared a
// second time here before, leaving these two values unchecked.
var_dump($disdef === $eisdef);