Example #1
0
File: Page.php Project: izosa/page
 /**
  * Runs a selector query against this page's DOM, parsing the HTML on first use.
  *
  * The parsed DOM is cached in $this->dom so repeated queries do not re-parse $this->html.
  *
  * @param string   $query selector passed straight through to the DOM's find()
  * @param int|null $index optional index passed straight through to the DOM's find()
  *
  * @return mixed whatever the underlying DOM's find() returns for the given query/index
  */
 public function find($query, $index = null)
 {
     if ($this->dom === null) {
         $this->dom = HtmlDomParser::str_get_html($this->html);
     }

     // The lookup result was previously discarded, so callers always received null.
     return $this->dom->find($query, $index);
 }
Example #2
0
 /**
  * Strips the doctype, <html> and <body> wrappers from a document that was built from a fragment.
  *
  * Nothing is changed unless the document reports that it was created without an html element.
  * The wrapper elements are removed while their children are kept in place, and the first
  * <p> element is renamed to "simpleHtmlDomP" so it can be dropped later on.
  *
  * Every lookup is guarded: a wrapper that is not present is simply skipped.
  *
  * @param HtmlDomParser $newDocument
  *
  * @return HtmlDomParser
  */
 protected function cleanHtmlWrapper(HtmlDomParser $newDocument)
 {
     if ($newDocument->getIsDOMDocumentCreatedWithoutHtml() !== true) {
         return $newDocument;
     }

     $document = $newDocument->getDocument();

     // Remove doc-type node (it is absent when the parser was told not to imply one).
     $doctype = $document->doctype;
     if ($doctype !== null && $doctype->parentNode !== null) {
         $doctype->parentNode->removeChild($doctype);
     }

     // Remove the html element first, then the body element, preserving child nodes.
     foreach (['html', 'body'] as $tagName) {
         $wrapper = $document->getElementsByTagName($tagName)->item(0);
         if ($wrapper === null || $wrapper->parentNode === null) {
             continue;
         }

         $fragment = $document->createDocumentFragment();
         while ($wrapper->childNodes->length > 0) {
             // appendChild() moves the node, so the child list shrinks on every pass.
             $fragment->appendChild($wrapper->childNodes->item(0));
         }
         $wrapper->parentNode->replaceChild($fragment, $wrapper);
     }

     // At this point DOMDocument still added a "<p>"-wrapper around our string,
     // so we replace it with "<simpleHtmlDomP>" and delete this at the ending ...
     $paragraph = $document->getElementsByTagName('p')->item(0);
     if ($paragraph !== null) {
         $this->changeElementName($paragraph, 'simpleHtmlDomP');
     }

     return $newDocument;
 }
 /**
  * Benchmarks several HTML parsers on the same two tasks and logs how similar their output is.
  *
  * Every parser (1) replaces the <title> of a locally cached copy of the test page and
  * (2) prefixes the href/src attributes of the link, img and script elements of the template.
  * Each closure returns [1 => page markup, 2 => template markup]; the template markup of
  * Simple HTML DOM Parser, XParser and Ganon is compared with similar_text() at the end.
  *
  * @return void
  */
 protected function compare()
 {
     $bench = new Ubench();
     $url = 'tests/templated-retrospect/index.html';
     $file = 'test.html';
     // Work on a cached copy so every parser is fed the same input.
     if (!file_exists($file)) {
         $htmlstr = file_get_contents($url);
         file_put_contents($file, $htmlstr);
     }
     $htmlstr = file_get_contents($file);

     $this->log('', true);
     $this->log('Measuring Simple HTML DOM Parser...');
     $resultsSimpleHtmlDomParser = $bench->run(function ($htmlstr) {
         $results = [];
         $html = HtmlDomParser::str_get_html($htmlstr);
         $html->find('title', 0)->innertext('New Title');
         $results[1] = $html->__toString();
         $tpl = HtmlDomParser::str_get_html(file_get_contents('tests/templated-retrospect/index.html'));
         foreach ($tpl->find('link') as $elem) {
             $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
         }
         foreach ($tpl->find('img, script') as $elem) {
             $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
         }
         $results[2] = $tpl->__toString();
         return $results;
     }, $htmlstr);
     $this->logBench($bench);

     $this->log('', true);
     $this->log('Measuring XParser...');
     $resultsXParser = $bench->run(function ($htmlstr) {
         $results = [];
         $html = new XNode($htmlstr);
         $html->find('title')->inner('New Title');
         $results[1] = $html->__toString();
         $tpl = new XNode(file_get_contents('tests/templated-retrospect/index.html'));
         foreach ($tpl('link') as $elem) {
             $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
         }
         foreach ($tpl('img, script') as $elem) {
             $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
         }
         $results[2] = $tpl->__toString();
         return $results;
     }, $htmlstr);
     $this->logBench($bench);

     $this->log('', true);
     $this->log('Measuring Ganon...');
     $resultsGanon = $bench->run(function ($htmlstr) {
         $results = [];
         $html = str_get_dom($htmlstr);
         foreach ($html('title') as $title) {
             $title->setInnerText('New Title');
         }
         $results[1] = $html->__toString();
         // Parse the template with Ganon as well; it used to be built with XNode, which made
         // this run measure (and later compare) XParser instead of Ganon.
         $tpl = str_get_dom(file_get_contents('tests/templated-retrospect/index.html'));
         foreach ($tpl('link') as $elem) {
             $elem->href = '//localhost/xparser/tests/templated-retrospect/' . $elem->href;
         }
         foreach ($tpl('img, script') as $elem) {
             $elem->src = '//localhost/xparser/tests/templated-retrospect/' . $elem->src;
         }
         $results[2] = $tpl->__toString();
         return $results;
     }, $htmlstr);
     $this->logBench($bench);

     $this->log('', true);
     $this->log('Symfony CSS Selector combined with DOMDocument and DOMXPath...');
     // The return value is intentionally not stored: it used to overwrite $resultsXParser,
     // which silently turned the "XParser" comparisons below into DOMDocument comparisons.
     $bench->run(function ($htmlstr) {
         $results = [];
         $html = new DOMDocument();
         // Real-world HTML is rarely valid; collect libxml errors instead of emitting warnings.
         libxml_use_internal_errors(true);
         $html->loadHTML($htmlstr);
         $converter = new CssSelectorConverter();
         $xpath = new DOMXPath($html);
         foreach ($xpath->query($converter->toXPath('title')) as $element) {
             // DOMElement has no innerHTML; nodeValue replaces the element's content with text.
             $element->nodeValue = 'New Title';
         }
         $results[1] = $html->saveHTML();

         $tpl = new DOMDocument();
         // The template is HTML, so it needs the HTML loader rather than the XML one.
         $tpl->loadHTMLFile('tests/templated-retrospect/index.html');
         // A DOMXPath is bound to one document; the template needs its own instance.
         $tplXpath = new DOMXPath($tpl);
         foreach ($tplXpath->query($converter->toXPath('link')) as $elem) {
             $elem->setAttribute('href', '//localhost/xparser/tests/templated-retrospect/' . $elem->getAttribute('href'));
         }
         foreach ($tplXpath->query($converter->toXPath('img, script')) as $elem) {
             $elem->setAttribute('src', '//localhost/xparser/tests/templated-retrospect/' . $elem->getAttribute('src'));
         }
         $results[2] = $tpl->saveHTML();
         return $results;
     }, $htmlstr);
     $this->logBench($bench);

     $this->log('', true);
     // NOTE: similar_text() returns the number of matching characters, so higher means closer.
     $this->log('Simple HTML DOM Parser vs Ganon distance: ' . similar_text($resultsSimpleHtmlDomParser[2], $resultsGanon[2]));
     $this->log('Simple HTML DOM Parser vs XParser distance: ' . similar_text($resultsSimpleHtmlDomParser[2], $resultsXParser[2]));
     $this->log('Ganon vs XParser distance: ' . similar_text($resultsGanon[2], $resultsXParser[2]));
     $this->log('', true);
     $this->log('', true);
 }
 /**
  * Parse the HTML page returned by the website and keep just the main body that we are interested in.
  * All embedded images are downloaded locally, both in their small and their large variant.
  *
  * The returned array has three keys:
  *  - 'images': map of small-image URL => ['small' => ..., 'large' => ...] as returned by fetchImage()
  *  - 'body':   the processed main content markup
  *  - 'date':   the journal date formatted as Y-m-d
  *
  * @param string $body
  * @return array
  * @throws Exception when the main content or a parseable journal date cannot be found
  */
 public function extractContent($body)
 {
     $content = [];

     // get just the main content
     $html = HtmlDomParser::str_get_html((string) $body);
     // daily journal page div[id=mainInner]
     $main_node = $html->find('body>div', 0);
     if ($main_node === null) {
         throw new \Exception('Unable to find the main content in the page.');
     }
     $main_content = $main_node->outertext;
     $main_content = $this->highlightChildName($this->config['child_name'], $main_content);

     // embed the small image in the email
     $main_content = str_replace("/webui/Files/Room/large/", "/webui/Files/Room/small/", $main_content);
     $main_content = str_replace('"/webui/', '"' . $this->config['host'] . '/webui/', $main_content);

     // get all images and download them; the host is quoted so that its dots (or a stray
     // delimiter character) are matched literally instead of being read as regex syntax
     $host_pattern = preg_quote($this->config['host'], '|');
     preg_match_all('|<img.*? src="(' . $host_pattern . '/webui/Files/Room/small/.*?)".*?>|', $main_content, $images);

     // process the images if there is any
     $content['images'] = [];
     if (isset($images[1])) {
         foreach ($images[1] as $image) {
             $content['images'][$image]['small'] = $this->fetchImage($image);
             // generate large image name: swap only the directory segment, so a host or a
             // file name that happens to contain "small" is left untouched
             $large_image = str_replace('/webui/Files/Room/small/', '/webui/Files/Room/large/', $image);
             $content['images'][$image]['large'] = $this->fetchImage($large_image);
         }
     }

     // add some inline style to be used by the email client
     $main_content = $this->addStyles($main_content);
     $content['body'] = $main_content;

     // get the date the journal is from
     // daily journal selector: div[class=head-dailyjournal-txt]
     $heading = $html->find('h1', 0);
     if ($heading === null) {
         throw new \Exception('Unable to find the journal date in the page.');
     }
     // remove some extra content: keep everything up to and including the 4-digit year
     $date = preg_replace("|(\\d{4}).*\$|", "\$1", $heading->innertext);
     $timestamp = strtotime(trim((string) $date));
     if ($timestamp === false) {
         // date() would otherwise silently turn the failure into 1970-01-01
         throw new \Exception('Unable to parse the journal date.');
     }
     $content['date'] = date("Y-m-d", $timestamp);

     return $content;
 }
 /**
  * Converts the loaded HTML into an HTML-string with inline styles based on the loaded CSS.
  *
  * CSS is collected from the configured stylesheet, optionally from local files referenced by
  * <link> elements and optionally from inline <style> blocks, then applied to the parsed DOM.
  *
  * @param bool $outputXHTML            [optional] Should we output valid XHTML?
  * @param int|null $libXMLExtraOptions [optional] $libXMLExtraOptions Since PHP 5.4.0 and Libxml 2.6.0,
  *                                     you may also use the options parameter to specify additional
  *                                     Libxml parameters.
  * @param string|false $path           [optional] Directory that <link> hrefs are resolved against;
  *                                     any falsy value falls back to the directory of this file.
  *
  * @return string
  *
  * @throws Exception when no HTML has been provided
  */
 public function convert($outputXHTML = false, $libXMLExtraOptions = null, $path = false)
 {
     // init
     $outputXHTML = (bool) $outputXHTML;
     // validate
     if (!$this->html) {
         throw new Exception('No HTML provided.');
     }
     // use local variables
     $css = $this->css;
     // create new HtmlDomParser
     $dom = HtmlDomParser::str_get_html($this->html, $libXMLExtraOptions);
     // check if there is some link css reference
     if ($this->loadCSSFromHTML) {
         foreach ($dom->find('link') as $node) {
             $file = ($path ?: __DIR__) . '/' . $node->getAttribute('href');
             // is_file() instead of file_exists(): an empty href resolves to the base directory,
             // which exists but cannot be read as a stylesheet
             if (!is_file($file)) {
                 continue;
             }
             $fileCss = file_get_contents($file);
             if ($fileCss === false) {
                 // unreadable file: keep the link rather than dropping styles silently
                 continue;
             }
             $css .= $fileCss;
             // converting to inline css because we don't need/want to load css files, so remove the link
             $node->outertext = '';
         }
     }
     // should we use inline style-block
     if ($this->useInlineStylesBlock) {
         if (true === $this->excludeConditionalInlineStylesBlock) {
             $this->html = preg_replace(self::$excludeConditionalInlineStylesBlockRegEx, '', $this->html);
         }
         $css .= $this->getCssFromInlineHtmlStyleBlock($this->html);
     }
     // process css
     $cssRules = $this->processCSS($css);
     // create new XPath
     $xPath = $this->createXPath($dom->getDocument(), $cssRules);
     // strip original style tags if we need to
     if ($this->stripOriginalStyleTags === true) {
         $this->stripOriginalStyleTags($xPath);
     }
     // cleanup the HTML if we need to
     if (true === $this->cleanup) {
         $this->cleanupHTML($xPath);
     }
     // should we output XHTML?
     if (true === $outputXHTML) {
         return $dom->xml();
     }
     // just regular HTML 4.01 as it should be used in newsletters
     return $dom->html();
 }