Example #1
1
 /**
  * Constructor.
  *
  * Parses the given HTML text into a DOMDocument (stored in $this->dom) and
  * records the charset the text was originally in.
  *
  * @param string $text     The text of the HTML document. An empty value is
  *                         replaced by a minimal '<html></html>' document.
  * @param string $charset  The charset of the HTML document. If null, the
  *                         encoding reported by libxml is used, falling back
  *                         to 'iso-8859-1'.
  *
  * @throws Exception  If the DOM extension is not loaded.
  */
 public function __construct($text, $charset = null)
 {
     if (!extension_loaded('dom')) {
         throw new Exception('DOM extension is not available.');
     }
     // Bug #9616: Make sure we have valid HTML input.
     if (!strlen($text)) {
         $text = '<html></html>';
     }
     /* Keep parser complaints out of PHP's error output while loading; the
      * previous mode is remembered so it can be put back afterwards. */
     $old_error = libxml_use_internal_errors(true);
     $doc = new DOMDocument();
     if (is_null($charset)) {
         /* If no charset given, charset is whatever libxml tells us the
          * encoding should be defaulting to 'iso-8859-1'. */
         $doc->loadHTML($text);
         $this->_origCharset = $doc->encoding ? $doc->encoding : 'iso-8859-1';
     } else {
         /* Convert/try with UTF-8 first. */
         $this->_origCharset = Horde_String::lower($charset);
         $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
         $doc->loadHTML($this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8'));
         if ($doc->encoding && Horde_String::lower($doc->encoding) != 'utf-8') {
             /* Convert charset to what the HTML document says it SHOULD
              * be. */
             $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
             $this->_xmlencoding = '';
         }
     }
     /* Restore the libxml error mode exactly as it was. The previous code
      * did the opposite: it switched the mode off when the caller had it on
      * and left it on when the caller had it off. */
     libxml_use_internal_errors($old_error);
     $this->dom = $doc;
 }
Example #2
1
 /**
  * Compares DOMHelper::trimFromHTMLString() with the HTML fixtures stored in
  * the htmlData directory next to this file. All comparisons ignore
  * whitespace.
  *
  * @test
  */
 public function trimFromHTMLString()
 {
     $helper = new DOMHelper();
     $dom = new DOMDocument();
     $directory = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'htmlData' . DIRECTORY_SEPARATOR;
     $input = file_get_contents($directory . 'trimAfterString_input_1.html');
     $htmlHead = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"><title>***</title></head><body>';
     $htmlFoot = '</body></html>';
     // Drops every whitespace run so formatting differences do not matter.
     $squash = function ($html) {
         return preg_replace('/\\s+/', '', $html);
     };

     // When the marker does not exist, the text is returned untouched.
     $untouched = $helper->trimFromHTMLString($input, "{{XXXXXX}}");
     $this->assertEquals(
         $squash(str_replace("\n", "", $input)),
         $squash(str_replace("\n", "", $untouched))
     );

     // Plain removal.
     $trimmed = $helper->trimFromHTMLString($input, "{{LIRE_LA_SUITE}}");
     $dom->loadHTML($htmlHead . file_get_contents($directory . 'trimAfterString_output_1.html') . $htmlFoot);
     $reference = $this->cleanTmpHTML(str_replace("\n", "", $dom->saveHTML()), $htmlHead, $htmlFoot);
     $this->assertEquals($squash($reference), $squash(str_replace("\n", "", $trimmed)));

     // Removal with a "Lire la suite" button inserted.
     $trimmed = $helper->trimFromHTMLString($input, "{{LIRE_LA_SUITE}}", "<button>Lire la suite</button>");
     $dom->loadHTML($htmlHead . file_get_contents($directory . 'trimAfterString_output_2.html') . $htmlFoot);
     $reference = $this->cleanTmpHTML(str_replace("\n", "", $dom->saveHTML()), $htmlHead, $htmlFoot);
     $this->assertEquals($squash($reference), $squash(str_replace("\n", "", $trimmed)));

     // Removal with a text and a "Lire la suite" button inserted.
     $trimmed = $helper->trimFromHTMLString($input, "{{LIRE_LA_SUITE}}", "Pour en savoir plus : <button>Lire la suite</button>");
     $dom->loadHTML($htmlHead . file_get_contents($directory . 'trimAfterString_output_3.html') . $htmlFoot);
     $reference = $this->cleanTmpHTML(str_replace("\n", "", $dom->saveHTML()), $htmlHead, $htmlFoot);
     $this->assertEquals($squash($reference), $squash(str_replace("\n", "", $trimmed)));
 }
Example #3
1
 /**
  * Replaces the wrapped document with a fresh one parsed from the markup.
  *
  * @param string $markup HTML to load.
  *
  * @throws \SxCore\Html\Exception\InvalidArgumentException When $markup is not a string.
  */
 public function setMarkup($markup)
 {
     if (is_string($markup)) {
         $document = new DOMDocument();
         $this->DOMDocument = $document;
         $document->loadHTML($markup);

         return;
     }

     throw new Exception\InvalidArgumentException('Expected string. Got "' . gettype($markup) . '".');
 }
 /**
  * Returns the DOM built from $this->htmlValidatorBodyContent, parsing it on
  * the first call and reusing the stored document afterwards.
  *
  * @return \DOMDocument
  */
 private function getDom()
 {
     if ($this->dom !== null) {
         return $this->dom;
     }

     $this->dom = new \DOMDocument();
     $this->dom->loadHTML($this->htmlValidatorBodyContent);

     return $this->dom;
 }
 /**
  * LinkedCssImporter constructor.
  *
  * Parses the HTML and remembers the directory part of the given file path.
  *
  * @param string $html       HTML source to parse.
  * @param string $filePath   Path whose directory name is stored.
  * @param FileSystem $fileSystem
  *
  * @throws \InvalidArgumentException When the HTML cannot be loaded.
  */
 public function __construct($html, $filePath, FileSystem $fileSystem)
 {
     $document = new \DOMDocument();
     $this->document = $document;

     $loaded = $document->loadHTML($html);
     if (!$loaded) {
         throw new \InvalidArgumentException('Cannot process HTML as a valid document');
     }

     $directory = pathinfo($filePath, PATHINFO_DIRNAME);
     $this->filePath = $directory;
     $this->fileSystem = $fileSystem;
 }
 /**
  * Loads the HTML into a new DOM document, resets the collected CSS and
  * starts CSS extraction.
  *
  * @param string $sHtml HTML source; control characters are stripped before parsing.
  * @throws \InvalidArgumentException When $sHtml is not a string.
  * @return \BoilerAppMessenger\StyleInliner\Processor\CssToInlineStylesProcessor
  */
 private function setHtml($sHtml)
 {
     if (!is_string($sHtml)) {
         throw new \InvalidArgumentException('Html expects string, "' . gettype($sHtml) . '" given');
     }

     $this->domDocument = new \DOMDocument('1.0', $this->getEncoding());

     // Strip control characters (tab, LF and CR are kept).
     $sCleanHtml = preg_replace('/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]+/u', '', $sHtml);
     if ($sCleanHtml === null) {
         // With the "u" modifier preg_replace() yields null when the subject
         // is not valid UTF-8, which used to load an empty document without
         // any notice. Every character in the class is a single ASCII byte,
         // so the same pattern can be applied byte-wise instead.
         $sCleanHtml = preg_replace('/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]+/', '', $sHtml);
     }

     $this->domDocument->loadHTML($sCleanHtml);
     $this->css = '';
     return $this->extractCss(null, $this->getBaseDir());
 }
/**
 * Wraps runs of "::"-prefixed paragraphs in block-grid lists.
 *
 * Every <p> whose text starts with "::" is collected. Paragraphs that directly
 * follow one another (as reported by get_real_previous_sibling()) form one
 * group. Each group is moved into a
 * <ul class="medium-block-grid-N takeaways innovate"> (N = group size) with
 * one <li> per paragraph, and every "::" is removed from the paragraph text.
 *
 * @param string $content HTML fragment to transform.
 *
 * @return string The transformed fragment with doctype/html/head/body tags
 *                stripped; an empty string for empty input.
 */
function add_block_grids($content)
{
    // Nothing to parse. DOMDocument::loadHTML() rejects an empty source with a
    // ValueError on PHP 8+, so bail out before reaching it.
    if ($content === null || $content === '') {
        return '';
    }
    // DOMDocument seems to have problems with the long dash, this fixes it.
    $detected = mb_detect_encoding($content);
    if ($detected === false) {
        // Detection failed; do not hand `false` to mb_convert_encoding().
        $detected = 'utf-8';
    }
    $content = mb_convert_encoding($content, 'utf-8', $detected);
    // NOTE(review): the 'html-entities' target is deprecated as of PHP 8.2 -
    // confirm the supported PHP range before replacing it.
    $content = mb_convert_encoding($content, 'html-entities', 'utf-8');
    $document = new DOMDocument('1.0', 'utf-8');
    set_error_handler(function () {
        /* ignore errors */
    });
    // The LIBXML_HTML_* flags need PHP 5.4+. version_compare() is used because
    // comparing the version string with a float is not a version comparison.
    if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
        $document->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    } else {
        $document->loadHTML($content);
    }
    restore_error_handler();
    $xpath = new DOMXpath($document);
    $blocks = $xpath->query("//p[starts-with(.,'::')]");

    // Split the matched paragraphs into runs of directly adjacent siblings.
    // Plain value arrays are used; the earlier version relied on a reference
    // between the current group and the first slot of the result.
    $block_groups = array();
    $current_group = array();
    $last_block = null;
    foreach ($blocks as $block) {
        $previous_sibling = get_real_previous_sibling($block);
        $continues_group = $last_block && $previous_sibling && $previous_sibling->isSameNode($last_block);
        if (!$continues_group && $current_group) {
            // A new series of blocks starts here; store the finished one.
            $block_groups[] = $current_group;
            $current_group = array();
        }
        $current_group[] = $block;
        $last_block = $block;
    }
    if ($current_group) {
        $block_groups[] = $current_group;
    }

    foreach ($block_groups as $group) {
        $ul = $document->createElement('ul');
        $count = count($group);
        $ul->setAttribute('class', "medium-block-grid-{$count} takeaways innovate");
        // Insert the UL before the block group p tags
        $group[0]->parentNode->insertBefore($ul, $group[0]);
        foreach ($group as $block) {
            $li = $document->createElement('li');
            $block->nodeValue = str_replace('::', '', $block->nodeValue);
            // appendChild() moves the paragraph out of its old position.
            $li->appendChild($block);
            $ul->appendChild($li);
        }
    }
    return preg_replace("~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\\s*~i", '', $document->saveHTML());
}
Example #8
1
 /**
  * Convert HTML to Apple News Markdown.
  *
  * The fragment is wrapped in a document skeleton, parsed, and the child
  * nodes of the resulting <body> are turned into blocks joined with
  * self::BLOCK_DELIMITER.
  *
  * @param string $html
  *   HTML to convert. Value is not validated, it is caller's responsibility
  *   to validate.
  *
  * @return string|NULL
  *   Markdown representation of the HTML (an empty string when there is
  *   nothing to convert), or NULL if the HTML could not be loaded.
  */
 public function convert($html)
 {
     if (preg_match('/^\\s*$/u', $html)) {
         return '';
     }
     // The meta tag makes the parser read the fragment as UTF-8.
     // NOTE(review): there is no opening <body> tag here, so the parser has
     // to imply it - confirm whether that is intentional before changing it.
     $html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head>' . $html . '</body></html>';
     $this->dom = new \DOMDocument();
     if (!$this->dom->loadHTML($html)) {
         return NULL;
     }
     $xp = new \DOMXPath($this->dom);
     $body = $xp->query('/html/body')->item(0);
     if ($body === NULL) {
         // No <body> was produced for this fragment, so there is nothing to
         // convert; avoid reading childNodes from a missing node.
         return '';
     }
     return implode(self::BLOCK_DELIMITER, $this->getBlocks($body->childNodes));
 }
Example #9
1
 /**
  * Tries to convert the given HTML into a plain text format.
  *
  * Loads $this->html into $this->document, renders it and runs the post
  * clean-up on the rendered text.
  *
  * @return string the HTML converted or empty string if not able to parse
  */
 function convert()
 {
     $output = '';
     // Collect parser errors internally while loading, then put the global
     // libxml error mode back so other code is not affected by this call.
     $previousErrors = libxml_use_internal_errors(true);
     $success = $this->document->loadHTML($this->html);
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrors);
     if ($success) {
         $output = trim($this->render($this->document));
         // Post clean up
         $output = $this->postCleanUp($output);
     }
     return $output;
 }
 /**
  * Old-school Constructor
  *
  * Stores the encoding and, when HTML is given, prepares it and loads it into
  * $this->dom. Nothing is loaded for an empty $html.
  *
  * @param string $html       The html block, not a full document
  * @param string $encoding   The encoding used for the html block
  * @return void
  */
 function WiziappDOMLoader($html = '', $encoding = 'UTF-8')
 {
     $this->encoding = $encoding;
     if (!empty($html)) {
         $html = $this->prepareHTMLString($html);
         $this->dom = new DOMDocument('1.0', $this->encoding);
         // Collect parser errors internally while loading; the previous libxml
         // error mode is restored below instead of being left switched on.
         $previousErrors = libxml_use_internal_errors(true);
         @$this->dom->loadHTML($html);
         $this->dom->encoding = $this->encoding;
         libxml_clear_errors();
         libxml_use_internal_errors($previousErrors);
         $this->dom->preserveWhiteSpace = false;
     }
     return;
 }
Example #11
1
 /**
  * Loads UTF-8 encoded HTML into the wrapped DOM document.
  *
  * @param string $html HTML source; an XML encoding declaration is prepended
  *                     so the parser treats it as UTF-8.
  * @return $this
  * @throws \Exception With code 5 and a translated message when loading
  *                    raises an exception; the original exception is
  *                    attached as "previous".
  */
 public function loadHTML($html = "")
 {
     try {
         // The HTML is UTF-8 encoded
         $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
         $this->dom->encoding = 'UTF-8';
     } catch (Exception $e) {
         // Replace the technical prefixes with translated, user-facing wording.
         $search = array('DOMDocument::loadHTML():', 'Entity');
         $replace = array(Yii::t('app', 'Check your code:'), Yii::t('app', 'Form'));
         $message = str_replace($search, $replace, $e->getMessage());
         // Chain the original exception so its trace is not lost.
         throw new Exception($message, 5, $e);
     }
     return $this;
 }
Example #12
1
 /**
  * Parses a full HTML document.
  *
  * The text is first round-tripped through DOMDocument to clean up invalid
  * HTML, then handed to HTML5_Tokenizer.
  *
  * @param $text HTML text to parse
  * @param $builder Custom builder implementation
  * @return Parsed HTML as DOMDocument
  */
 public static function parse($text, $builder = null)
 {
     // Prefix an encoding hint only when the text is strictly valid UTF-8.
     $isUtf8 = mb_detect_encoding($text, "UTF-8", true) == "UTF-8";
     $source = $isUtf8 ? '<?xml encoding="UTF-8" ?>' . $text : $text;

     // Cleanup invalid HTML
     $cleaner = new DOMDocument();
     @$cleaner->loadHTML($source);
     $cleaned = $cleaner->saveHTML();

     $tokenizer = new HTML5_Tokenizer($cleaned, $builder);
     $tokenizer->parse();

     return $tokenizer->save();
 }
Example #13
1
 /**
  * Adds a valid cloneable field to the form and checks its decorator, the
  * rendered markup and its base elements.
  */
 public function testCorrectSetup()
 {
     $this->form->add($this->prepareValidCloneableField());
     $fieldName = 'cloneable_field';

     $this->assertInstanceOf('\\Phalcon\\DI', $this->form->get($fieldName)->getDecorator()->getDI());
     $this->form->get($fieldName)->getDecorator()->setTemplateName('jquery');

     // Default rendering: count the fieldsets and inputs inside the field.
     $domDoc = new \DOMDocument('1.0');
     $domDoc->loadHTML($this->form->get($fieldName)->render());
     $container = $domDoc->getElementById($fieldName);
     $this->assertEquals(2, $container->getElementsByTagName('fieldset')->length);
     $this->assertEquals(4, $container->getElementsByTagName('input')->length);

     // Rendering with an extra attribute: it must show up on the container.
     $domDoc->loadHTML($this->form->get($fieldName)->render(['attribute' => 'test']));
     $attribute = $domDoc->getElementById($fieldName)->attributes->getNamedItem('attribute');
     $this->assertEquals('test', $attribute->value);

     $this->assertNull($this->form->get($fieldName)->getBaseElement('test3'));
     $this->assertInstanceOf('\\Phalcon\\Forms\\ElementInterface', $this->form->get($fieldName)->getBaseElement('test2'));
 }
Example #14
1
 /**
  * Stores the language pair and loads the HTML fragment into a DOM document.
  *
  * @param string $html       HTML fragment; it is wrapped in a marker <div>.
  * @param mixed  $sourceLang Stored as-is in $this->sourceLang.
  * @param mixed  $targetLang Stored as-is in $this->targetLang.
  *
  * @throws \Exception When the HTML cannot be loaded.
  */
 public function __construct($html, $sourceLang, $targetLang)
 {
     $this->sourceLang = $sourceLang;
     $this->targetLang = $targetLang;

     $this->doc = new \DOMDocument();
     $this->doc->strictErrorChecking = FALSE;

     $error = $this->errorStart();
     // Setting meta below is a hack to get our DomDocument into utf-8. All other
     // methods tried didn't work.
     $wrapped = '<meta http-equiv="content-type" content="text/html; charset=utf-8"><div id="eggs-n-cereal-dont-ever-use-this-id">' . $html . '</div>';
     $success = $this->doc->loadHTML($wrapped);
     $this->errorStop($error);

     if ($success) {
         return;
     }
     throw new \Exception('Invalid HTML');
 }
Example #15
1
File: Html.php Project: lortnus/zf1
 /**
  * Object constructor
  *
  * Loads the HTML, then adds the 'title', named <meta> and 'body' fields and
  * collects the body links and header links.
  *
  * @param string  $data          HTML source, or a file name when $isFile is true
  * @param boolean $isFile        whether $data names a file to read
  * @param boolean $storeContent  add 'body' as a Text field (true) or an UnStored field (false)
  */
 private function __construct($data, $isFile, $storeContent)
 {
     $this->_doc = new DOMDocument();
     $this->_doc->substituteEntities = true;

     $htmlData = $isFile ? file_get_contents($data) : $data;
     @$this->_doc->loadHTML($htmlData);

     // The document is not modified below, so the encoding is read once.
     $encoding = $this->_doc->actualEncoding;
     $xpath = new DOMXPath($this->_doc);

     // title should always have only one entry, but we process all nodeset entries
     $docTitle = '';
     foreach ($xpath->query('/html/head/title') as $titleNode) {
         $docTitle .= $titleNode->nodeValue . ' ';
     }
     $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $encoding));

     // One field per <meta name="..."> element.
     foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
         $metaName = $metaNode->getAttribute('name');
         $metaContent = $metaNode->getAttribute('content');
         $this->addField(Zend_Search_Lucene_Field::Text($metaName, $metaContent, $encoding));
     }

     // body should always have only one entry, but we process all nodeset entries
     $docBody = '';
     foreach ($xpath->query('/html/body') as $bodyNode) {
         $this->_retrieveNodeText($bodyNode, $docBody);
     }
     $bodyField = $storeContent
         ? Zend_Search_Lucene_Field::Text('body', $docBody, $encoding)
         : Zend_Search_Lucene_Field::UnStored('body', $docBody, $encoding);
     $this->addField($bodyField);

     // Links found anywhere in the document, optionally skipping rel="nofollow".
     foreach ($this->_doc->getElementsByTagName('a') as $anchor) {
         $href = $anchor->getAttribute('href');
         if ($href == '') {
             continue;
         }
         if (self::$_excludeNoFollowLinks && strtolower($anchor->getAttribute('rel')) == 'nofollow') {
             continue;
         }
         $this->_links[] = $href;
     }
     $this->_links = array_unique($this->_links);

     // <link> elements in the document head.
     foreach ($xpath->query('/html/head/link') as $headLink) {
         $href = $headLink->getAttribute('href');
         if ($href != '') {
             $this->_headerLinks[] = $href;
         }
     }
     $this->_headerLinks = array_unique($this->_headerLinks);
 }
Example #16
1
/**
 * Parses an HTML string into a DOMDocument, silencing parser diagnostics.
 *
 * @param string $html HTML source.
 *
 * @return DOMDocument The parsed document; an empty document when $html is
 *                     empty.
 */
function loadDOC($html)
{
    $doc = new DOMDocument('1.0', 'UTF8');
    $doc->formatOutput = false;
    // DOMDocument::loadHTML() throws a ValueError for an empty source on
    // PHP 8+, and "@" does not suppress exceptions. Hand back the empty
    // document instead, which is what PHP 7 ended up returning.
    if ($html === null || $html === '') {
        return $doc;
    }
    @$doc->loadHTML($html, LIBXML_COMPACT | LIBXML_NOERROR | LIBXML_NOBLANKS | LIBXML_NOWARNING | LIBXML_ERR_NONE | LIBXML_NOXMLDECL | LIBXML_HTML_NODEFDTD | LIBXML_PARSEHUGE);
    return $doc;
}
 /**
  * {@inheritdoc}
  *
  * Applies $this->config as properties of a new DOMDocument and loads the
  * HTML into it. Unless $encoding is false, an XML encoding declaration is
  * prepended as a charset hint and the resulting processing-instruction
  * nodes are removed again after loading.
  */
 public function parseHtml($html, $encoding = 'UTF-8')
 {
     $document = new \DOMDocument();
     foreach ($this->config as $name => $value) {
         $document->{$name} = $value;
     }

     if ($encoding === false) {
         // No charset hint requested.
         // @codingStandardsIgnoreStart
         @$document->loadHTML($html);
         // @codingStandardsIgnoreEnd
         return $document;
     }

     // Only assign a usable value; false or an empty string is not a valid
     // document encoding.
     if ($encoding) {
         $document->encoding = $encoding;
     }
     // Tell the parser which charset to use
     $encoding = $encoding ?: $document->encoding;
     $html = '<?xml encoding="' . $encoding . '" ?>' . $html;
     // @codingStandardsIgnoreStart
     @$document->loadHTML($html);
     // @codingStandardsIgnoreEnd

     // childNodes is a live list: removing nodes while iterating it directly
     // can skip siblings, so iterate over a snapshot instead.
     foreach (iterator_to_array($document->childNodes) as $item) {
         if ($item->nodeType == XML_PI_NODE) {
             $document->removeChild($item);
         }
     }
     return $document;
 }
Example #18
0
 /**
  * Parses the given HTML and prepares an XPath evaluator for it.
  *
  * @param string $htmlDocument HTML source to load.
  */
 public function __construct($htmlDocument)
 {
     $this->dom = new \DOMDocument();
     // Collect parser errors internally while loading, then drop them and
     // restore the previous libxml error mode so the global setting does not
     // stay switched on after construction.
     $previousErrors = libxml_use_internal_errors(true);
     $this->dom->loadHTML($htmlDocument);
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrors);
     $this->xpath = new \DOMXPath($this->dom);
 }
Example #19
0
 /**
  * Returns the DOM for $this->html, building it on the first call.
  *
  * @return DOMDocument DOM to manipulate
  */
 public function getDoc()
 {
     if (!$this->doc) {
         // DOMDocument::loadHTML apparently isn't very good with encodings, so
         // convert input to ASCII by encoding everything above 128 as entities.
         if (function_exists('mb_convert_encoding')) {
             $html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
         } else {
             $html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
                 return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
             }, $this->html);
         }
         // Workaround for bug that caused spaces before references
         // to disappear during processing: https://phabricator.wikimedia.org/T55086
         // TODO: Please replace with a better fix if one can be found.
         $html = str_replace(' <', '&#32;<', $html);

         // Remember the caller's libxml error mode so it can be restored
         // afterwards rather than being forced to "off".
         $previousErrors = libxml_use_internal_errors(true);
         // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it
         // is only used on older versions.
         $manageLoader = PHP_VERSION_ID < 80000 && function_exists('libxml_disable_entity_loader');
         $loader = false;
         if ($manageLoader) {
             $loader = libxml_disable_entity_loader();
         }
         $this->doc = new DOMDocument();
         $this->doc->strictErrorChecking = false;
         $this->doc->loadHTML($html);
         if ($manageLoader) {
             libxml_disable_entity_loader($loader);
         }
         libxml_use_internal_errors($previousErrors);
         $this->doc->encoding = 'UTF-8';
     }
     return $this->doc;
 }
Example #20
0
 /**
  * Extracts the title, source file and message from an HTML error page.
  *
  * Results are stored in $this->title, $this->sourceFile and $this->message.
  *
  * @param string $html HTML of the error page.
  *
  * @return bool|null false when $html is empty; nothing is returned otherwise.
  */
 public function parse($html)
 {
     if (empty($html)) {
         $this->title = $this->message = 'Empty exception';
         $this->sourceFile = '';
         return false;
     }
     if (!$this->domDocument) {
         $this->domDocument = new \DOMDocument();
     }
     @$this->domDocument->loadHTML($html);
     $titleItem = $this->domDocument->getElementsByTagName("title")->item(0);
     $this->title = $titleItem ? $titleItem->textContent : 'N/A';
     try {
         $sourceFileElement = $this->domDocument->getElementById("tracy-bs-error");
         if (is_object($sourceFileElement)) {
             $sourceFileLinkNode = $sourceFileElement->getElementsByTagName("a")->item(0);
             // The error element may contain no link at all; keep the empty
             // result without reading a property from null.
             $this->sourceFile = is_object($sourceFileLinkNode) ? trim($sourceFileLinkNode->textContent) : '';
         } else {
             $this->sourceFile = 'Unknown format of exception';
         }
         $messageNode = $this->domDocument->getElementsByTagName("p")->item(0);
         if (is_object($messageNode)) {
             // The last child of the first paragraph is dropped before the
             // text is read. An empty paragraph has no last child, and
             // removeChild(null) raises a TypeError that the catch below
             // would not intercept.
             if ($messageNode->lastChild !== null) {
                 $messageNode->removeChild($messageNode->lastChild);
             }
             $this->message = trim($messageNode->textContent);
         } else {
             $this->message = 'Unable to parse';
         }
     } catch (\Exception $e) {
         $this->message = 'Unable to parse';
     }
 }
 /**
  * Stores the HTML, loads it into the existing DOM document (parser warnings
  * are silenced) and creates a new XPath evaluator for that document.
  *
  * @param string $html HTML source.
  *
  * @return $this
  */
 public function setHtml($html)
 {
     $this->html = $html;

     $document = $this->domDocument;
     @$document->loadHTML($html);
     $this->domXPath = new \DOMXPath($document);

     return $this;
 }
Example #22
0
 /**
  * Fills in and submits the lead creation form and checks that the lead was
  * saved.
  *
  * @return string The generated lead name.
  */
 public function testCreate()
 {
     $crawler = $this->client->request('GET', $this->getUrl('orocrm_sales_lead_create'));
     /** @var Form $form */
     $form = $crawler->selectButton('Save and Close')->form();
     $name = 'name' . $this->generateRandomString();

     $values = array(
         'orocrm_sales_lead_form[name]' => $name,
         'orocrm_sales_lead_form[firstName]' => 'firstName',
         'orocrm_sales_lead_form[lastName]' => 'lastName',
         'orocrm_sales_lead_form[address][city]' => 'City Name',
         'orocrm_sales_lead_form[address][label]' => 'Main Address',
         'orocrm_sales_lead_form[address][postalCode]' => '10000',
         'orocrm_sales_lead_form[address][street2]' => 'Second Street',
         'orocrm_sales_lead_form[address][street]' => 'Main Street',
         'orocrm_sales_lead_form[companyName]' => 'Company',
         'orocrm_sales_lead_form[email]' => '*****@*****.**',
         'orocrm_sales_lead_form[owner]' => 1,
         'orocrm_sales_lead_form[dataChannel]' => $this->getReference('default_channel')->getId(),
     );
     foreach ($values as $fieldName => $value) {
         $form[$fieldName] = $value;
     }

     // Build the country and region selects by hand and register them on the
     // form before choosing their values.
     $doc = new \DOMDocument("1.0");

     $countryMarkup = '<select name="orocrm_sales_lead_form[address][country]" id="orocrm_sales_lead_form_address_country" ' . 'tabindex="-1" class="select2-offscreen"> ' . '<option value="" selected="selected"></option> ' . '<option value="US">United States</option> </select>';
     $doc->loadHTML($countryMarkup);
     $form->set(new ChoiceFormField($doc->getElementsByTagName('select')->item(0)));

     $regionMarkup = '<select name="orocrm_sales_lead_form[address][region]" id="orocrm_sales_lead_form_address_region" ' . 'tabindex="-1" class="select2-offscreen"> ' . '<option value="" selected="selected"></option> ' . '<option value="US-CA">California</option> </select>';
     $doc->loadHTML($regionMarkup);
     $form->set(new ChoiceFormField($doc->getElementsByTagName('select')->item(0)));

     $form['orocrm_sales_lead_form[address][country]'] = 'US';
     $form['orocrm_sales_lead_form[address][region]'] = 'US-CA';

     $this->client->followRedirects(true);
     $crawler = $this->client->submit($form);

     $response = $this->client->getResponse();
     $this->assertHtmlResponseStatusCodeEquals($response, 200);
     $this->assertContains("Lead saved", $crawler->html());

     return $name;
 }
Example #23
0
 /**
  * Constructor.
  *
  * Stores the current URL and parses the HTML source into a DOM document.
  *
  * @param String $source      HTML source to parse
  * @param mixed  $currentURL  Stored as-is in $this->_currentURL
  */
 public function __construct($source, $currentURL = null)
 {
     $this->_currentURL = $currentURL;

     $dom = new DOMDocument();
     $this->_dom = $dom;
     // Parser warnings are silenced; unfortunately necessary until an error
     // handler is implemented.
     @$dom->loadHTML($source);
 }
Example #24
0
 /**
  * Parse a given URL
  *
  * Fetches the HTML through getUrlHTML() and loads it into $this->DOM.
  *
  * @param string $url
  */
 public function __construct($url)
 {
     $urlHTML = $this->getUrlHTML($url);
     // Disable bad formatted HTML warnings while loading, then drop the
     // collected errors and restore the previous libxml error mode so the
     // global setting does not stay switched on after construction.
     $previousErrors = libxml_use_internal_errors(true);
     $this->DOM = new \DOMDocument();
     $this->DOM->loadHTML($urlHTML);
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrors);
 }
 /**
  * {@inheritdoc}
  *
  * Requests the URI and loads the response body into $this->document when
  * the returned promise invokes the callback.
  */
 public function __construct($uri)
 {
     $this->uri = new Uri($uri);
     $this->document = new \DOMDocument();

     // Parser warnings for the fetched markup are silenced.
     $onResponse = function (ResponseInterface $response) {
         @$this->document->loadHTML($response->getBody());
     };

     $this->makeRequest($this->uri)->then($onResponse);
 }
Example #26
0
 /**
  * Loads the source into a new DOM document and creates its XPath evaluator.
  *
  * @param string $source      HTML source.
  * @param bool   $requiresFix Pass the source through fixInvalidHtml() first.
  *
  * @throws FailedToLoadDomException When the source cannot be loaded.
  */
 protected function _initialize($source, $requiresFix = false)
 {
     $markup = $requiresFix ? $this->fixInvalidHtml($source) : $source;

     $this->_dom = new \DOMDocument();
     $loaded = @$this->_dom->loadHTML($markup);
     if (!$loaded) {
         throw new FailedToLoadDomException(static::MESSAGE_DOM_EXCEPTION);
     }

     $this->_xpath = new \DOMXPath($this->_dom);
 }
 /**
  * DomCrawler constructor.
  *
  * Loads the page content into $this->domDocument with libxml errors
  * collected internally, then clears them and restores the previous mode.
  *
  * @param $pageContent
  *
  * @throws \InvalidArgumentException When $pageContent is empty.
  */
 public function __construct($pageContent)
 {
     if (!$pageContent) {
         throw new \InvalidArgumentException('No page content provided');
     }

     $document = new \DOMDocument();
     $this->domDocument = $document;

     $previous_value = libxml_use_internal_errors(true);
     $document->loadHTML($pageContent);
     libxml_clear_errors();
     libxml_use_internal_errors($previous_value);
 }
Example #28
0
 /**
  * {@inheritdoc}
  *
  * Loads the HTML and maps the trimmed text of every <td> whose class
  * contains "e" to the trimmed text of the node that follows it.
  */
 public function parse($html)
 {
     $this->document->loadHTML($html);
     $expression = '//tr//td[contains(@class, "e")]';
     $xpath = new DOMXpath($this->document);
     $tds = $xpath->query($expression);
     $variables = array();
     foreach ($tds as $td) {
         // A matching cell can be the last node in its row; use an empty value
         // then instead of reading a property from null.
         $valueNode = $td->nextSibling;
         $variables[trim($td->nodeValue)] = $valueNode === null ? '' : trim($valueNode->nodeValue);
     }
     return $variables;
 }
 /**
  * Returns the DOM for $this->sourceHtml, building it on the first call.
  * The document stays empty when the source HTML is an empty string.
  *
  * @return \DOMDocument
  */
 private function getSourceDom()
 {
     if (!is_null($this->sourceDom)) {
         return $this->sourceDom;
     }

     // Switch to internal error collection and remember the previous mode.
     $previousSetting = libxml_use_internal_errors(true);

     $this->sourceDom = new \DOMDocument();
     if ($this->sourceHtml != '') {
         $this->sourceDom->loadHTML($this->sourceHtml);
     }

     libxml_use_internal_errors($previousSetting);

     return $this->sourceDom;
 }
Example #30
0
 /**
  * Turns an HTML string into a token list by parsing it with DOMDocument and
  * walking the first <div> inside the resulting <body>.
  *
  * @param string $html     HTML to tokenize.
  * @param object $config   Configuration; 'Core'/'AggressivelyFixLt' is read here.
  * @param mixed  $context  Passed by reference to normalize() and wrapHTML().
  *
  * @return array The tokens collected by tokenizeDOM().
  */
 public function tokenizeHTML($html, $config, &$context)
 {
     $html = $this->normalize($html, $config, $context);

     // attempt to armor stray angled brackets that cannot possibly
     // form tags and thus are probably being used as emoticons
     if ($config->get('Core', 'AggressivelyFixLt')) {
         $char = '[^a-z!\\/]';
         $comment = "/<!--(.*?)(-->|\\z)/is";
         $armorComments = array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities');
         $restoreComments = array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst');

         // Comments are armored first so the bracket escaping leaves them
         // alone, and restored afterwards.
         $html = preg_replace_callback($comment, $armorComments, $html);
         $html = preg_replace("/<({$char})/i", '&lt;\\1', $html);
         $html = preg_replace_callback($comment, $restoreComments, $html);
     }

     // preprocess html, essential for UTF-8
     $html = $this->wrapHTML($html, $config, $context);

     $doc = new DOMDocument();
     // theoretically, wrapHTML() has this covered
     $doc->encoding = 'UTF-8';

     set_error_handler(array($this, 'muteErrorHandler'));
     $doc->loadHTML($html);
     restore_error_handler();

     // html > body > first div holds the wrapped input.
     $root = $doc->getElementsByTagName('html')->item(0);
     $body = $root->getElementsByTagName('body')->item(0);
     $wrapper = $body->getElementsByTagName('div')->item(0);

     $tokens = array();
     $this->tokenizeDOM($wrapper, $tokens);

     return $tokens;
 }