/**
 * Filter XHtml document
 *
 * Filter for the document, which may modify / restructure a document and
 * assign semantic information bits to the elements in the tree.
 *
 * Looks up the body element below the html root (matched by local name, so
 * namespaced XHTML works too), hands it to calculateContentFactors() and, if
 * $this->mostImportantNode is set afterwards, replaces all children of the
 * body with a deep copy of that node. Everything outside the body (e.g. the
 * head metadata) is left untouched.
 *
 * The document is modified in place; the same instance is returned for
 * convenience. A document without an html/body element is returned
 * unchanged.
 *
 * @param DOMDocument $document
 * @return DOMDocument
 */
public function filter(DOMDocument $document)
{
    $xpath = new DOMXPath($document);
    $body  = $xpath->query('/*[local-name() = "html"]/*[local-name() = "body"]')->item(0);

    // item(0) yields null when the query matched nothing; there is no body
    // to analyse or restructure in that case.
    if ($body === null) {
        return $document;
    }

    // NOTE(review): presumably this is what populates
    // $this->mostImportantNode - confirm against its implementation.
    $this->calculateContentFactors($body);

    if ($this->mostImportantNode !== false) {
        // Replace contents of body node with the found "most important"
        // section, so we keep the metadata, but omit everything we consider
        // as layout.
        //
        // Take the deep copy before emptying the body, so the copy does not
        // depend on any of the nodes removed below.
        $contentNode = $this->mostImportantNode->cloneNode(true);

        // Remove all children from the HTML body
        while ($body->firstChild !== null) {
            $body->removeChild($body->firstChild);
        }

        // Re-add the detected content node
        $body->appendChild($contentNode);
    }

    return $document;
}