/**
 * Run a selector query against the HTML held by this object.
 *
 * The DOM is built lazily from $this->html on the first call and cached in
 * $this->dom for subsequent calls.
 *
 * @param string   $query Selector passed through to the underlying DOM's find().
 * @param int|null $index Optional index passed through to the underlying DOM's find().
 *
 * @return mixed Whatever the underlying DOM's find() returns for the given arguments.
 */
public function find($query, $index = null)
{
    if ($this->dom === null) {
        $this->dom = HtmlDomParser::str_get_html($this->html);
    }

    // The original implementation dropped this result, so callers always received null.
    return $this->dom->find($query, $index);
}
/**
 * Strip the doc-type, <html> and <body> wrappers from a document that was
 * created from an HTML fragment, keeping the wrapped child nodes in place.
 *
 * Nothing is changed unless getIsDOMDocumentCreatedWithoutHtml() returns true.
 * Every wrapper node is looked up defensively: a node that is absent is simply
 * skipped instead of being dereferenced.
 *
 * @param HtmlDomParser $newDocument
 *
 * @return HtmlDomParser The same instance that was passed in.
 */
protected function cleanHtmlWrapper(HtmlDomParser $newDocument)
{
    if ($newDocument->getIsDOMDocumentCreatedWithoutHtml() === true) {
        $document = $newDocument->getDocument();

        // Remove doc-type node (it may be absent, so guard before dereferencing).
        $doctype = $document->doctype;
        if ($doctype !== null && $doctype->parentNode !== null) {
            $doctype->parentNode->removeChild($doctype);
        }

        // Remove the html element first, then the body element, preserving child nodes.
        foreach (['html', 'body'] as $wrapperTag) {
            $wrapper = $document->getElementsByTagName($wrapperTag)->item(0);
            if ($wrapper === null || $wrapper->parentNode === null) {
                continue;
            }

            $fragment = $document->createDocumentFragment();
            // appendChild() moves the node, so the wrapper's child list shrinks on every pass.
            while ($wrapper->childNodes->length > 0) {
                $fragment->appendChild($wrapper->childNodes->item(0));
            }
            $wrapper->parentNode->replaceChild($fragment, $wrapper);
        }

        // At this point DOMDocument still added a "<p>"-wrapper around our string,
        // so we replace it with "<simpleHtmlDomP>" and delete this at the ending ...
        $paragraph = $document->getElementsByTagName('p')->item(0);
        if ($paragraph !== null) {
            $this->changeElementName($paragraph, 'simpleHtmlDomP');
        }
    }

    return $newDocument;
}
/**
 * Benchmark several HTML parsing libraries on the same template and log the results.
 *
 * Each library performs two tasks inside a timed closure:
 *   1. replace the <title> text of the cached test document, and
 *   2. prefix the href/src attributes of link, img and script elements of the template.
 *
 * After each run the benchmark is logged via logBench(); at the end the
 * similar_text() score of the second task's output is logged pairwise for
 * Simple HTML DOM Parser, Ganon and XParser.
 *
 * @return void
 */
protected function compare()
{
    $bench = new Ubench();
    $url = 'tests/templated-retrospect/index.html';
    $file = 'test.html';

    // Cache the template in a local working copy on first use.
    if (!file_exists($file)) {
        $htmlstr = file_get_contents($url);
        file_put_contents($file, $htmlstr);
    }
    $htmlstr = file_get_contents($file);

    $this->log('', true);
    $this->log('Measuring Simple HTML DOM Parser...');
    $resultsSimpleHtmlDomParser = $bench->run(function ($htmlstr) {
        $results = [];

        $html = HtmlDomParser::str_get_html($htmlstr);
        $html->find('title', 0)->innertext('New Title');
        $results[1] = $html->__toString();

        $tpl = HtmlDomParser::str_get_html(file_get_contents('tests/templated-retrospect/index.html'));
        foreach ($tpl->find('link') as $elem) {
            $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
        }
        foreach ($tpl->find('img, script') as $elem) {
            $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
        }
        $results[2] = $tpl->__toString();

        return $results;
    }, $htmlstr);
    $this->logBench($bench);

    $this->log('', true);
    $this->log('Measuring XParser...');
    $resultsXParser = $bench->run(function ($htmlstr) {
        $results = [];

        $html = new XNode($htmlstr);
        $html->find('title')->inner('New Title');
        $results[1] = $html->__toString();

        $tpl = new XNode(file_get_contents('tests/templated-retrospect/index.html'));
        foreach ($tpl('link') as $elem) {
            $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
        }
        foreach ($tpl('img, script') as $elem) {
            $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
        }
        $results[2] = $tpl->__toString();

        return $results;
    }, $htmlstr);
    $this->logBench($bench);

    $this->log('', true);
    $this->log('Measuring Ganon...');
    $resultsGanon = $bench->run(function ($htmlstr) {
        $results = [];

        $html = str_get_dom($htmlstr);
        foreach ($html('title') as $title) {
            $title->setInnerText('New Title');
        }
        $results[1] = $html->__toString();

        // Parse the template with Ganon as well; the previous code used XNode here,
        // so the "Ganon" figures partly measured XParser.
        $tpl = str_get_dom(file_get_contents('tests/templated-retrospect/index.html'));
        foreach ($tpl('link') as $elem) {
            $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
        }
        foreach ($tpl('img, script') as $elem) {
            $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
        }
        $results[2] = $tpl->__toString();

        return $results;
    }, $htmlstr);
    $this->logBench($bench);

    $this->log('', true);
    $this->log('Symfony CSS Selector combined with DOMDocument and DOMXPath...');
    // The return value is intentionally not stored: assigning it to $resultsXParser
    // (as before) made the "vs XParser" comparisons below use DOMDocument output.
    $bench->run(function ($htmlstr) {
        $results = [];
        $converter = new CssSelectorConverter();
        libxml_use_internal_errors(true);

        $html = new DOMDocument();
        $html->loadHTML($htmlstr);
        $xpath = new DOMXPath($html);
        foreach ($xpath->query($converter->toXPath('title')) as $element) {
            // DOM nodes have no innerHTML property; nodeValue replaces the text content.
            $element->nodeValue = 'New Title';
        }
        $results[1] = $html->saveHTML();

        $tpl = new DOMDocument();
        // The template is HTML, not XML, so it must go through the HTML loader.
        $tpl->loadHTMLFile('tests/templated-retrospect/index.html');
        // Queries must run against the template document, not the first document.
        $tplXPath = new DOMXPath($tpl);
        foreach ($tplXPath->query($converter->toXPath('link')) as $elem) {
            $elem->setAttribute('href', '//localhost/xparser/tests/templated-retrospect/' . $elem->getAttribute('href'));
        }
        foreach ($tplXPath->query($converter->toXPath('img, script')) as $elem) {
            $elem->setAttribute('src', '//localhost/xparser/tests/templated-retrospect/' . $elem->getAttribute('src'));
        }
        $results[2] = $tpl->saveHTML();

        return $results;
    }, $htmlstr);
    $this->logBench($bench);

    $this->log('', true);
    $this->log('Simple HTML DOM Parser vs Ganon distance: ' . similar_text($resultsSimpleHtmlDomParser[2], $resultsGanon[2]));
    $this->log('Simple HTML DOM Parser vs XParser distance: ' . similar_text($resultsSimpleHtmlDomParser[2], $resultsXParser[2]));
    $this->log('Ganon vs XParser distance: ' . similar_text($resultsGanon[2], $resultsXParser[2]));
    $this->log('', true);
    $this->log('', true);
}
/**
 * Parse the HTML page returned by the website and keep just the main body that we are interested in.
 * All embedded images are downloaded locally.
 *
 * The returned array has three keys:
 *   - 'images': map of small-image URL => ['small' => ..., 'large' => ...], holding the
 *               results of fetchImage() for the small and the large variant,
 *   - 'body':   the processed main content markup,
 *   - 'date':   the journal date formatted as Y-m-d.
 *
 * @param string $body
 * @return array
 * @throws Exception When the page cannot be parsed or the main content container is missing.
 */
public function extractContent($body)
{
    $content = ['images' => []];

    // get just the main content
    $html = HtmlDomParser::str_get_html((string) $body);
    if (!is_object($html)) {
        throw new \Exception('Unable to parse the HTML page.');
    }

    // daily journal page div[id=mainInner]
    $main_node = $html->find('body>div', 0);
    if ($main_node === null) {
        throw new \Exception('Main content container not found in the HTML page.');
    }
    $main_content = $main_node->outertext;
    $main_content = $this->highlightChildName($this->config['child_name'], $main_content);

    // embed the small image in the email
    $small_dir = '/webui/Files/Room/small/';
    $large_dir = '/webui/Files/Room/large/';
    $main_content = str_replace($large_dir, $small_dir, $main_content);
    $main_content = str_replace('"/webui/', '"' . $this->config['host'] . '/webui/', $main_content);

    // get all images and download them; the host is quoted so that characters such as
    // "." or the "|" delimiter are matched literally instead of acting as regex syntax
    $pattern = '|<img.*? src="(' . preg_quote($this->config['host'], '|') . $small_dir . '.*?)".*?>|';
    preg_match_all($pattern, $main_content, $images);

    // process the images if there is any
    if (isset($images[1])) {
        foreach ($images[1] as $image) {
            $content['images'][$image]['small'] = $this->fetchImage($image);

            // generate large image name: swap only the directory segment, so a "small"
            // occurring in the host or the file name is left untouched
            $large_image = str_replace($small_dir, $large_dir, $image);
            $content['images'][$image]['large'] = $this->fetchImage($large_image);
        }
    }

    // add some inline style to be used by the email client
    $main_content = $this->addStyles($main_content);
    $content['body'] = $main_content;

    // get the date the journal is from
    // daily journal selector: div[class=head-dailyjournal-txt]
    $heading = $html->find('h1', 0);
    $date = ($heading !== null) ? (string) $heading->innertext : '';

    // remove some extra content: drop everything after the first four-digit year
    $date = preg_replace('|(\d{4}).*$|', '$1', $date);
    $content['date'] = date("Y-m-d", strtotime(trim($date)));

    return $content;
}
/**
 * Produce an HTML string from the loaded HTML, processed together with the loaded CSS.
 *
 * CSS is collected from $this->css, optionally from <link> elements whose target
 * file exists on disk, and optionally from the inline style blocks of the HTML.
 *
 * @param bool         $outputXHTML        [optional] Return the result of xml() instead of html().
 * @param int|null     $libXMLExtraOptions [optional] Extra Libxml options handed to the HTML parser
 *                                         (available since PHP 5.4.0 and Libxml 2.6.0).
 * @param string|false $path               [optional] Base directory for external css-files referenced
 *                                         by <link> elements; a falsy value falls back to __DIR__.
 *
 * @return string
 *
 * @throws Exception When no HTML has been provided.
 */
public function convert($outputXHTML = false, $libXMLExtraOptions = null, $path = false)
{
    // nothing to do without markup
    if (!$this->html) {
        throw new Exception('No HTML provided.');
    }

    $wantsXhtml = (bool) $outputXHTML;

    // work on a local copy of the stylesheet text
    $styles = $this->css;

    // parse the markup
    $document = HtmlDomParser::str_get_html($this->html, $libXMLExtraOptions);

    // pull in css files referenced through <link> elements
    if ($this->loadCSSFromHTML) {
        $baseDir = $path ?: __DIR__;

        foreach ($document->find('link') as $linkNode) {
            $cssFile = $baseDir . '/' . $linkNode->getAttribute('href');

            if (!file_exists($cssFile)) {
                continue;
            }

            $styles .= file_get_contents($cssFile);

            // the file content is now part of the collected css, so drop the link itself
            $linkNode->outertext = '';
        }
    }

    // pull in css from style blocks embedded in the markup
    if ($this->useInlineStylesBlock) {
        if ($this->excludeConditionalInlineStylesBlock === true) {
            $this->html = preg_replace(self::$excludeConditionalInlineStylesBlockRegEx, '', $this->html);
        }

        $styles .= $this->getCssFromInlineHtmlStyleBlock($this->html);
    }

    // turn the collected css into rules and build the XPath helper from them
    $rules = $this->processCSS($styles);
    $xPath = $this->createXPath($document->getDocument(), $rules);

    // optional post-processing steps
    if (true === $this->stripOriginalStyleTags) {
        $this->stripOriginalStyleTags($xPath);
    }

    if ($this->cleanup === true) {
        $this->cleanupHTML($xPath);
    }

    // XHTML on request, otherwise regular HTML 4.01 as it should be used in newsletters
    return $wantsXhtml ? $document->xml() : $document->html();
}