/**
 * Constructor.
 *
 * Parses $text into a DOMDocument and records the charset the document was
 * originally in. libxml parse errors are buffered while parsing and the
 * caller's libxml error mode is restored afterwards.
 *
 * @param string $text     The text of the HTML document.
 * @param string $charset  The charset of the HTML document. When null, the
 *                         encoding reported by libxml is used, falling back
 *                         to 'iso-8859-1'.
 *
 * @throws Exception  If the DOM extension is not loaded.
 */
public function __construct($text, $charset = null)
{
    if (!extension_loaded('dom')) {
        throw new Exception('DOM extension is not available.');
    }

    // Bug #9616: Make sure we have valid HTML input.
    if (!strlen($text)) {
        $text = '<html></html>';
    }

    // Buffer libxml errors instead of emitting PHP warnings; remember the
    // previous mode so it can be put back exactly as it was.
    $old_error = libxml_use_internal_errors(true);
    $doc = new DOMDocument();

    if (is_null($charset)) {
        /* If no charset given, charset is whatever libxml tells us the
         * encoding should be defaulting to 'iso-8859-1'. */
        $doc->loadHTML($text);
        $this->_origCharset = $doc->encoding
            ? $doc->encoding
            : 'iso-8859-1';
    } else {
        /* Convert/try with UTF-8 first. */
        $this->_origCharset = Horde_String::lower($charset);
        $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
        $doc->loadHTML($this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8'));

        if ($doc->encoding &&
            Horde_String::lower($doc->encoding) != 'utf-8') {
            /* Convert charset to what the HTML document says it SHOULD
             * be. */
            $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
            $this->_xmlencoding = '';
        }
    }

    // Only discard the buffered errors when this constructor was the one
    // that turned buffering on; a caller that already buffers may still
    // want to inspect them.
    if (!$old_error) {
        libxml_clear_errors();
    }
    // The previous code called libxml_use_internal_errors(false) only when
    // the old value was true, i.e. it inverted the caller's setting.
    libxml_use_internal_errors($old_error);

    $this->dom = $doc;
}
/**
 * Compares DOMHelper::trimFromHTMLString() with the fixtures stored in the
 * htmlData directory; all whitespace is stripped from both sides before
 * each comparison.
 *
 * @test
 */
public function trimFromHTMLString()
{
    $helper = new DOMHelper();
    $fixtureDir = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'htmlData' . DIRECTORY_SEPARATOR;
    $dom = new DOMDocument();
    $input = file_get_contents($fixtureDir . 'trimAfterString_input_1.html');

    // When the marker does not exist, the text is returned untouched.
    $expected = str_replace("\n", "", $input);
    $actual = str_replace("\n", "", $helper->trimFromHTMLString($input, "{{XXXXXX}}"));
    $this->assertEquals(preg_replace('/\\s+/', '', $expected), preg_replace('/\\s+/', '', $actual));

    $htmlHead = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"><title>***</title></head><body>';
    $htmlFoot = '</body></html>';

    // Expected-output fixture => arguments handed to trimFromHTMLString().
    $cases = array(
        // Plain removal.
        'trimAfterString_output_1.html' => array($input, "{{LIRE_LA_SUITE}}"),
        // Removal with a "Lire la suite" button inserted.
        'trimAfterString_output_2.html' => array($input, "{{LIRE_LA_SUITE}}", "<button>Lire la suite</button>"),
        // Removal with a text and a "Lire la suite" button inserted.
        'trimAfterString_output_3.html' => array($input, "{{LIRE_LA_SUITE}}", "Pour en savoir plus : <button>Lire la suite</button>"),
    );

    foreach ($cases as $fixture => $arguments) {
        $trimmed = call_user_func_array(array($helper, 'trimFromHTMLString'), $arguments);
        $actual = str_replace("\n", "", $trimmed);

        $dom->loadHTML($htmlHead . file_get_contents($fixtureDir . $fixture) . $htmlFoot);
        $expected = $this->cleanTmpHTML(str_replace("\n", "", $dom->saveHTML()), $htmlHead, $htmlFoot);

        $this->assertEquals(preg_replace('/\\s+/', '', $expected), preg_replace('/\\s+/', '', $actual));
    }
}
/**
 * Replaces the wrapped DOM document with a new one parsed from $markup.
 *
 * @param string $markup HTML markup to load.
 *
 * @throws \SxCore\Html\Exception\InvalidArgumentException When $markup is not a string.
 */
public function setMarkup($markup)
{
    if (is_string($markup)) {
        $this->DOMDocument = new DOMDocument();
        $this->DOMDocument->loadHTML($markup);

        return;
    }

    throw new Exception\InvalidArgumentException('Expected string. Got "' . gettype($markup) . '".');
}
/**
 * Returns the DOM built from the validator body content, parsing it the
 * first time it is requested and reusing it afterwards.
 *
 * @return \DOMDocument
 */
private function getDom()
{
    if ($this->dom !== null) {
        return $this->dom;
    }

    $this->dom = new \DOMDocument();
    $this->dom->loadHTML($this->htmlValidatorBodyContent);

    return $this->dom;
}
/**
 * LinkedCssImporter constructor.
 *
 * Parses the HTML up front and keeps the directory part of $filePath.
 *
 * @param string     $html       HTML to parse.
 * @param string     $filePath   Path whose directory name is stored.
 * @param FileSystem $fileSystem File system object stored on the instance.
 *
 * @throws \InvalidArgumentException When DOMDocument::loadHTML() reports failure.
 */
public function __construct($html, $filePath, FileSystem $fileSystem)
{
    $this->document = new \DOMDocument();

    $parsed = $this->document->loadHTML($html);
    if (!$parsed) {
        throw new \InvalidArgumentException('Cannot process HTML as a valid document');
    }

    $directory = pathinfo($filePath, PATHINFO_DIRNAME);
    $this->filePath = $directory;
    $this->fileSystem = $fileSystem;
}
/**
 * Loads the given HTML into a fresh DOM document, resets the collected CSS
 * and hands over to extractCss().
 *
 * ASCII control characters (except tab, LF and CR) are stripped before
 * parsing.
 *
 * @param string $sHtml
 * @throws \InvalidArgumentException When $sHtml is not a string.
 * @return \BoilerAppMessenger\StyleInliner\Processor\CssToInlineStylesProcessor
 *         Whatever extractCss() returns (declared type kept from the
 *         original docblock - TODO confirm against extractCss()).
 */
private function setHtml($sHtml)
{
    if (!is_string($sHtml)) {
        throw new \InvalidArgumentException('Html expects string, "' . gettype($sHtml) . '" given');
    }

    $sControlChars = '/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]+/';

    // With the "u" modifier preg_replace() returns null when the subject is
    // not valid UTF-8; that null used to be passed straight to loadHTML().
    $sCleanHtml = preg_replace($sControlChars . 'u', '', $sHtml);
    if ($sCleanHtml === null) {
        // The class only contains single-byte ASCII control characters, so
        // a byte-wise replacement removes the same characters.
        $sCleanHtml = preg_replace($sControlChars, '', $sHtml);
    }
    if ($sCleanHtml === null) {
        // Last resort: parse the input as it was given.
        $sCleanHtml = $sHtml;
    }

    $this->domDocument = new \DOMDocument('1.0', $this->getEncoding());
    $this->domDocument->loadHTML($sCleanHtml);
    $this->css = '';

    return $this->extractCss(null, $this->getBaseDir());
}
/**
 * Wraps runs of adjacent paragraphs that start with "::" in a
 * <ul class="medium-block-grid-N takeaways innovate"> list, one <li> per
 * paragraph, where N is the number of paragraphs in the run.
 *
 * Two "::" paragraphs belong to the same run when
 * get_real_previous_sibling() of the second one is the first one.
 *
 * @param string $content HTML fragment.
 *
 * @return string The rewritten fragment with the doctype and the
 *                html/head/body tags removed again.
 */
function add_block_grids($content)
{
    // DOMDocument seems to have problems with the long dash, this fixes it.
    $detected = mb_detect_encoding($content);
    if ($detected !== false) {
        // Only convert when detection succeeded; `false` is not an encoding.
        $content = mb_convert_encoding($content, 'utf-8', $detected);
    }
    $content = mb_convert_encoding($content, 'html-entities', 'utf-8');

    $document = new DOMDocument('1.0', 'utf-8');
    set_error_handler(function () { /* ignore errors */ });
    // version_compare() instead of comparing the version string to a float.
    if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
        $document->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    } else {
        $document->loadHTML($content);
    }
    restore_error_handler();

    $xpath = new DOMXpath($document);
    $blocks = $xpath->query("//p[starts-with(.,'::')]");

    // Collect the runs in document order. Plain arrays are used on purpose:
    // the earlier by-reference slot was aliased by the loop variable of the
    // foreach below, which made the result depend on foreach copy semantics.
    $block_groups = array();
    $current_group = array();
    $last_block = null;
    foreach ($blocks as $block) {
        $previous_sibling = get_real_previous_sibling($block);
        if ($last_block && $previous_sibling && $previous_sibling->isSameNode($last_block)) {
            // We're still in the same block group.
            $current_group[] = $block;
        } else {
            // A new series of blocks starts here; close the previous one.
            if ($current_group) {
                $block_groups[] = $current_group;
            }
            $current_group = array($block);
        }
        $last_block = $block;
    }
    if ($current_group) {
        $block_groups[] = $current_group;
    }

    foreach ($block_groups as $group) {
        $ul = $document->createElement('ul');
        $count = count($group);
        $ul->setAttribute('class', "medium-block-grid-{$count} takeaways innovate");

        // Insert the UL before the block group p tags
        $group[0]->parentNode->insertBefore($ul, $group[0]);

        foreach ($group as $block) {
            $li = $document->createElement('li');
            $block->nodeValue = str_replace('::', '', $block->nodeValue);
            $li->appendChild($block);
            $ul->appendChild($li);
        }
    }

    return preg_replace("~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\\s*~i", '', $document->saveHTML());
}
/**
 * Convert HTML to Apple News Markdown.
 *
 * The fragment is wrapped in a minimal UTF-8 document before parsing, and
 * the children of the resulting <body> are converted block by block.
 *
 * @param string $html
 *   HTML to convert. Value is not validated, it is caller's responsibility
 *   to validate.
 *
 * @return string|NULL
 *   Markdown representation of the HTML, or NULL if failed. An input made
 *   only of whitespace yields an empty string.
 */
public function convert($html)
{
    if (preg_match('/^\\s*$/u', $html)) {
        return '';
    }

    // The wrapper previously closed </body> without opening it, leaving it
    // to the parser to decide where (and whether) a body element starts.
    $html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body>' . $html . '</body></html>';

    $this->dom = new \DOMDocument();
    if (!$this->dom->loadHTML($html)) {
        return NULL;
    }

    $xp = new \DOMXPath($this->dom);
    $body = $xp->query('/html/body')->item(0);
    if ($body === NULL) {
        // No body element to convert; report failure rather than
        // dereferencing NULL.
        return NULL;
    }

    return implode(self::BLOCK_DELIMITER, $this->getBlocks($body->childNodes));
}
/**
 * Tries to convert the stored HTML into a plain text format.
 *
 * libxml parse errors are buffered and discarded while loading, and the
 * previous libxml error mode is restored afterwards so the setting does not
 * leak into the rest of the request.
 *
 * @return string the HTML converted or empty string if not able to parse
 */
function convert()
{
    $output = '';

    $previousMode = libxml_use_internal_errors(true);
    $success = $this->document->loadHTML($this->html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    if ($success) {
        $output = trim($this->render($this->document));

        // Post clean up
        $output = $this->postCleanUp($output);
    }

    return $output;
}
/**
 * Old-school Constructor
 *
 * Stores the encoding and, when $html is not empty, parses it into
 * $this->dom. libxml errors are buffered and discarded during parsing and
 * the previous libxml error mode is restored afterwards.
 *
 * @param string $html The html block, not a full document
 * @param string $encoding The encoding used for the html block
 */
function WiziappDOMLoader($html = '', $encoding = 'UTF-8')
{
    $this->encoding = $encoding;

    if (empty($html)) {
        return;
    }

    $html = $this->prepareHTMLString($html);
    $this->dom = new DOMDocument('1.0', $this->encoding);

    // Remember the caller's mode; it used to be left switched on.
    $previousMode = libxml_use_internal_errors(true);
    @$this->dom->loadHTML($html);
    $this->dom->encoding = $this->encoding;
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    $this->dom->preserveWhiteSpace = false;
}
/**
 * Loads a UTF-8 encoded HTML string into the wrapped DOM document.
 *
 * NOTE(review): DOMDocument::loadHTML() reports markup problems as
 * warnings, so the catch below presumably relies on an error handler that
 * turns warnings into exceptions - confirm.
 *
 * @param string $html
 * @return $this
 * @throws \Exception With code 5 and a translated message; the exception
 *                    that was caught is attached as "previous".
 */
public function loadHTML($html = "")
{
    try {
        // The HTML is UTF-8 encoded
        $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        $this->dom->encoding = 'UTF-8';
    } catch (Exception $e) {
        $search = array('DOMDocument::loadHTML():', 'Entity');
        $replace = array(Yii::t('app', 'Check your code:'), Yii::t('app', 'Form'));
        $message = str_replace($search, $replace, $e->getMessage());

        // Keep the original exception in the chain so its trace is not lost.
        throw new Exception($message, 5, $e);
    }

    return $this;
}
/**
 * Parses a full HTML document.
 *
 * The text is first round-tripped through DOMDocument (load, then
 * saveHTML) and the result is fed to HTML5_Tokenizer. Text detected as
 * UTF-8 is prefixed with an XML encoding declaration before loading.
 *
 * @param string $text    HTML text to parse
 * @param mixed  $builder Custom builder implementation, passed to the tokenizer
 * @return mixed The value returned by HTML5_Tokenizer::save()
 */
public static function parse($text, $builder = null)
{
    $isUtf8 = mb_detect_encoding($text, "UTF-8", true) == "UTF-8";
    $markup = $isUtf8 ? '<?xml encoding="UTF-8" ?>' . $text : $text;

    // Cleanup invalid HTML
    $doc = new DOMDocument();
    @$doc->loadHTML($markup);
    $cleaned = $doc->saveHTML();

    $tokenizer = new HTML5_Tokenizer($cleaned, $builder);
    $tokenizer->parse();

    return $tokenizer->save();
}
/**
 * Adds a valid cloneable field to the form and checks its decorator, the
 * markup it renders and its base elements.
 */
public function testCorrectSetup()
{
    $this->form->add($this->prepareValidCloneableField());

    $di = $this->form->get('cloneable_field')->getDecorator()->getDI();
    $this->assertInstanceOf('\\Phalcon\\DI', $di);

    $this->form->get('cloneable_field')->getDecorator()->setTemplateName('jquery');

    // Default rendering: count the fieldsets and inputs inside the container.
    $domDoc = new \DOMDocument('1.0');
    $defaultMarkup = $this->form->get('cloneable_field')->render();
    $domDoc->loadHTML($defaultMarkup);
    $fieldsets = $domDoc->getElementById('cloneable_field')->getElementsByTagName('fieldset');
    $this->assertEquals(2, $fieldsets->length);
    $inputs = $domDoc->getElementById('cloneable_field')->getElementsByTagName('input');
    $this->assertEquals(4, $inputs->length);

    // Rendering with an extra attribute: it must show up on the container.
    $customMarkup = $this->form->get('cloneable_field')->render(['attribute' => 'test']);
    $domDoc->loadHTML($customMarkup);
    $attribute = $domDoc->getElementById('cloneable_field')->attributes->getNamedItem('attribute');
    $this->assertEquals('test', $attribute->value);

    $missingBase = $this->form->get('cloneable_field')->getBaseElement('test3');
    $this->assertNull($missingBase);

    $existingBase = $this->form->get('cloneable_field')->getBaseElement('test2');
    $this->assertInstanceOf('\\Phalcon\\Forms\\ElementInterface', $existingBase);
}
/**
 * Parses the HTML fragment, wrapped in a marker <div>, and stores the
 * source and target languages.
 *
 * @param string $html       HTML fragment to load.
 * @param mixed  $sourceLang Stored as the source language.
 * @param mixed  $targetLang Stored as the target language.
 *
 * @throws \Exception When DOMDocument::loadHTML() reports failure.
 */
public function __construct($html, $sourceLang, $targetLang)
{
    $this->doc = new \DOMDocument();
    $this->doc->strictErrorChecking = FALSE;
    $this->sourceLang = $sourceLang;
    $this->targetLang = $targetLang;

    $error = $this->errorStart();

    // Setting meta below is a hack to get our DomDocument into utf-8. All other
    // methods tried didn't work.
    $wrapped = '<meta http-equiv="content-type" content="text/html; charset=utf-8"><div id="eggs-n-cereal-dont-ever-use-this-id">' . $html . '</div>';
    $success = $this->doc->loadHTML($wrapped);

    $this->errorStop($error);

    if ($success) {
        return;
    }

    throw new \Exception('Invalid HTML');
}
/**
 * Object constructor
 *
 * Parses the HTML and adds 'title', one field per named <meta> tag and
 * 'body' to the document, then collects the href values of <a> elements
 * and of <link> elements in the head.
 *
 * @param string  $data          HTML string, or a file name when $isFile is true
 * @param boolean $isFile        Whether $data names a file to read
 * @param boolean $storeContent  Whether the body text is added as a Text
 *                               field (true) or an UnStored field (false)
 */
private function __construct($data, $isFile, $storeContent)
{
    $this->_doc = new DOMDocument();
    $this->_doc->substituteEntities = true;

    $htmlData = $isFile ? file_get_contents($data) : $data;
    @$this->_doc->loadHTML($htmlData);

    $xpath = new DOMXPath($this->_doc);

    // title should always have only one entry, but we process all nodeset entries
    $docTitle = '';
    foreach ($xpath->query('/html/head/title') as $titleNode) {
        $docTitle .= $titleNode->nodeValue . ' ';
    }
    $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $this->_doc->actualEncoding));

    foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
        $metaField = Zend_Search_Lucene_Field::Text(
            $metaNode->getAttribute('name'),
            $metaNode->getAttribute('content'),
            $this->_doc->actualEncoding
        );
        $this->addField($metaField);
    }

    // body should always have only one entry, but we process all nodeset entries
    $docBody = '';
    foreach ($xpath->query('/html/body') as $bodyNode) {
        $this->_retrieveNodeText($bodyNode, $docBody);
    }
    $bodyField = $storeContent
        ? Zend_Search_Lucene_Field::Text('body', $docBody, $this->_doc->actualEncoding)
        : Zend_Search_Lucene_Field::UnStored('body', $docBody, $this->_doc->actualEncoding);
    $this->addField($bodyField);

    // Links in the document, optionally skipping rel="nofollow" ones.
    foreach ($this->_doc->getElementsByTagName('a') as $anchor) {
        $href = $anchor->getAttribute('href');
        if ($href == '') {
            continue;
        }
        if (self::$_excludeNoFollowLinks && strtolower($anchor->getAttribute('rel')) == 'nofollow') {
            continue;
        }
        $this->_links[] = $href;
    }
    $this->_links = array_unique($this->_links);

    // <link> elements in the head.
    foreach ($xpath->query('/html/head/link') as $headLink) {
        $href = $headLink->getAttribute('href');
        if ($href != '') {
            $this->_headerLinks[] = $href;
        }
    }
    $this->_headerLinks = array_unique($this->_headerLinks);
}
/**
 * Parses an HTML string into a DOMDocument, silencing parser diagnostics.
 *
 * An empty input yields an empty document: since PHP 8
 * DOMDocument::loadHTML() throws a ValueError for an empty string, which
 * neither the @ operator nor the NOERROR/NOWARNING flags suppress.
 *
 * @param string $html HTML markup.
 *
 * @return DOMDocument The parsed document (no children for empty input).
 */
function loadDOC($html)
{
    $doc = new DOMDocument('1.0', 'UTF8');
    $doc->formatOutput = false;

    if ($html === null || $html === false || $html === '') {
        // Nothing to parse; keep the best-effort contract of this helper.
        return $doc;
    }

    @$doc->loadHTML($html, LIBXML_COMPACT | LIBXML_NOERROR | LIBXML_NOBLANKS | LIBXML_NOWARNING | LIBXML_ERR_NONE | LIBXML_NOXMLDECL | LIBXML_HTML_NODEFDTD | LIBXML_PARSEHUGE);

    return $doc;
}
/**
 * {@inheritdoc}
 *
 * Creates a DOMDocument, applies every entry of $this->config as a
 * property, and loads $html. Unless $encoding is false, the markup is
 * prefixed with an XML encoding declaration so the parser uses that
 * charset; the processing instruction this leaves behind is removed from
 * the document again.
 *
 * @param string       $html     Markup to parse.
 * @param string|false $encoding Charset to announce to the parser, or false
 *                               to load the markup untouched. An empty value
 *                               falls back to the document's own encoding
 *                               property.
 *
 * @return \DOMDocument
 */
public function parseHtml($html, $encoding = 'UTF-8')
{
    $document = new \DOMDocument();
    foreach ($this->config as $name => $value) {
        $document->{$name} = $value;
    }

    if ($encoding === false) {
        // @codingStandardsIgnoreStart
        @$document->loadHTML($html);
        // @codingStandardsIgnoreEnd

        return $document;
    }

    // Only assign a real encoding name: assigning false/null/'' is invalid
    // (a ValueError on PHP 8) and would defeat the fallback below.
    if ($encoding) {
        $document->encoding = $encoding;
    }

    // Tell the parser which charset to use
    $encoding = $encoding ?: $document->encoding;
    $html = '<?xml encoding="' . $encoding . '" ?>' . $html;

    // @codingStandardsIgnoreStart
    @$document->loadHTML($html);
    // @codingStandardsIgnoreEnd

    // Iterate over a snapshot: removing nodes while walking the live
    // childNodes list can skip siblings.
    foreach (iterator_to_array($document->childNodes) as $item) {
        if ($item->nodeType == XML_PI_NODE) {
            $document->removeChild($item);
        }
    }

    return $document;
}
/**
 * Parses the HTML document and prepares an XPath object for it.
 *
 * libxml errors are buffered and discarded while parsing; the previous
 * libxml error mode is restored afterwards instead of being left on.
 *
 * @param string $htmlDocument HTML markup to load.
 */
public function __construct($htmlDocument)
{
    $this->dom = new \DOMDocument();

    $previousMode = libxml_use_internal_errors(true);
    $this->dom->loadHTML($htmlDocument);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    $this->xpath = new \DOMXPath($this->dom);
}
/**
 * Returns the DOM for $this->html, building it on first use.
 *
 * @return DOMDocument DOM to manipulate
 */
public function getDoc()
{
    if (!$this->doc) {
        // DOMDocument::loadHTML apparently isn't very good with encodings, so
        // convert input to ASCII by encoding everything above 128 as entities.
        if (function_exists('mb_convert_encoding')) {
            $html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
        } else {
            $html = preg_replace_callback('/[\\x{80}-\\x{10ffff}]/u', function ($m) {
                return '&#' . UtfNormal\Utils::utf8ToCodepoint($m[0]) . ';';
            }, $this->html);
        }

        // Workaround for bug that caused spaces before references
        // to disappear during processing: https://phabricator.wikimedia.org/T55086
        // The space is written as a character reference so it is not
        // dropped; replacing ' <' with itself did nothing.
        // TODO: Please replace with a better fix if one can be found.
        $html = str_replace(' <', '&#32;<', $html);

        $previousErrorMode = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it
        // is only called on older versions.
        $toggleEntityLoader = PHP_VERSION_ID < 80000;
        if ($toggleEntityLoader) {
            $loader = libxml_disable_entity_loader();
        }

        $this->doc = new DOMDocument();
        $this->doc->strictErrorChecking = false;
        $this->doc->loadHTML($html);

        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($loader);
        }

        // Drop the buffered errors and restore the caller's mode rather
        // than forcing it to false.
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $this->doc->encoding = 'UTF-8';
    }

    return $this->doc;
}
/**
 * Extracts title, source file and message from an HTML error page.
 *
 * The source file is the text of the first <a> inside the element with id
 * "tracy-bs-error"; the message is the text of the first <p> after its
 * last child node has been removed.
 *
 * @param string $html HTML of the error page.
 *
 * @return false|null false when $html is empty; nothing otherwise.
 */
public function parse($html)
{
    if (empty($html)) {
        $this->title = $this->message = 'Empty exception';
        $this->sourceFile = '';

        return false;
    }

    if (!$this->domDocument) {
        $this->domDocument = new \DOMDocument();
    }
    @$this->domDocument->loadHTML($html);

    $titleItem = $this->domDocument->getElementsByTagName("title")->item(0);
    $this->title = $titleItem ? $titleItem->textContent : 'N/A';

    try {
        $sourceFileElement = $this->domDocument->getElementById("tracy-bs-error");
        $sourceFileLinkNode = is_object($sourceFileElement)
            ? $sourceFileElement->getElementsByTagName("a")->item(0)
            : null;

        // The container may exist without any link inside it; treat that
        // like an unknown format instead of reading a property of null.
        if (is_object($sourceFileLinkNode)) {
            $this->sourceFile = trim($sourceFileLinkNode->textContent);
        } else {
            $this->sourceFile = 'Unknown format of exception';
        }

        $messageNode = $this->domDocument->getElementsByTagName("p")->item(0);
        if (is_object($messageNode)) {
            // An empty <p> has no lastChild; removeChild(null) raises a
            // TypeError, which the catch below would not intercept.
            if ($messageNode->lastChild !== null) {
                $messageNode->removeChild($messageNode->lastChild);
            }
            $this->message = trim($messageNode->textContent);
        } else {
            $this->message = 'Unable to parse';
        }
    } catch (\Exception $e) {
        $this->message = 'Unable to parse';
    }
}
/**
 * Stores the HTML, loads it into the existing DOM document (parser
 * warnings are suppressed) and creates a new XPath object for it.
 *
 * @param string $html HTML markup.
 *
 * @return $this
 */
public function setHtml($html)
{
    $this->html = $html;

    $document = $this->domDocument;
    @$document->loadHTML($html);
    $this->domXPath = new \DOMXPath($document);

    return $this;
}
/**
 * Fills in and submits the lead creation form and checks that the lead
 * was saved.
 *
 * @return string The generated lead name.
 */
public function testCreate()
{
    $crawler = $this->client->request('GET', $this->getUrl('orocrm_sales_lead_create'));

    /** @var Form $form */
    $form = $crawler->selectButton('Save and Close')->form();

    $name = 'name' . $this->generateRandomString();
    $values = array(
        'orocrm_sales_lead_form[name]' => $name,
        'orocrm_sales_lead_form[firstName]' => 'firstName',
        'orocrm_sales_lead_form[lastName]' => 'lastName',
        'orocrm_sales_lead_form[address][city]' => 'City Name',
        'orocrm_sales_lead_form[address][label]' => 'Main Address',
        'orocrm_sales_lead_form[address][postalCode]' => '10000',
        'orocrm_sales_lead_form[address][street2]' => 'Second Street',
        'orocrm_sales_lead_form[address][street]' => 'Main Street',
        'orocrm_sales_lead_form[companyName]' => 'Company',
        'orocrm_sales_lead_form[email]' => '*****@*****.**',
        'orocrm_sales_lead_form[owner]' => 1,
    );
    foreach ($values as $fieldName => $fieldValue) {
        $form[$fieldName] = $fieldValue;
    }
    $form['orocrm_sales_lead_form[dataChannel]'] = $this->getReference('default_channel')->getId();

    // The country and region selects are built by hand and added to the
    // form before their values are set below.
    $doc = new \DOMDocument("1.0");

    $countrySelect = '<select name="orocrm_sales_lead_form[address][country]" id="orocrm_sales_lead_form_address_country" '
        . 'tabindex="-1" class="select2-offscreen"> '
        . '<option value="" selected="selected"></option> '
        . '<option value="US">United States</option> </select>';
    $doc->loadHTML($countrySelect);
    $field = new ChoiceFormField($doc->getElementsByTagName('select')->item(0));
    $form->set($field);

    $regionSelect = '<select name="orocrm_sales_lead_form[address][region]" id="orocrm_sales_lead_form_address_region" '
        . 'tabindex="-1" class="select2-offscreen"> '
        . '<option value="" selected="selected"></option> '
        . '<option value="US-CA">California</option> </select>';
    $doc->loadHTML($regionSelect);
    $field = new ChoiceFormField($doc->getElementsByTagName('select')->item(0));
    $form->set($field);

    $form['orocrm_sales_lead_form[address][country]'] = 'US';
    $form['orocrm_sales_lead_form[address][region]'] = 'US-CA';

    $this->client->followRedirects(true);
    $crawler = $this->client->submit($form);

    $result = $this->client->getResponse();
    $this->assertHtmlResponseStatusCodeEquals($result, 200);
    $this->assertContains("Lead saved", $crawler->html());

    return $name;
}
/**
 * Ctor
 *
 * Stores the current URL and parses the source into a DOM document.
 *
 * @param string      $source     HTML source to load
 * @param string|null $currentURL Stored as the current URL
 */
public function __construct($source, $currentURL = null)
{
    $this->_currentURL = $currentURL;

    $document = new DOMDocument();
    $this->_dom = $document;

    // unfortunately necessary until an error handler is implemented
    @$document->loadHTML($source);
}
/**
 * Parse a given URL
 *
 * Fetches the HTML through getUrlHTML() and loads it into $this->DOM.
 * libxml errors are buffered and discarded while parsing, and the previous
 * libxml error mode is restored afterwards instead of being left on.
 *
 * @param string $url
 */
public function __construct($url)
{
    $urlHTML = $this->getUrlHTML($url);

    // Disable bad formatted HTML warnings
    $previousMode = libxml_use_internal_errors(true);

    $this->DOM = new \DOMDocument();
    $this->DOM->loadHTML($urlHTML);

    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
}
/**
 * {@inheritdoc}
 *
 * Wraps $uri in a Uri object, creates an empty DOM document and registers
 * a callback on the request that loads the response body into it.
 */
public function __construct($uri)
{
    $this->uri = new Uri($uri);
    $this->document = new \DOMDocument();

    // Parser warnings are suppressed.
    $loadBody = function (ResponseInterface $response) {
        @$this->document->loadHTML($response->getBody());
    };

    $this->makeRequest($this->uri)->then($loadBody);
}
/**
 * Loads the source into a new DOM document and creates its XPath object.
 *
 * @param string $source      HTML source.
 * @param bool   $requiresFix When true the source is first passed through
 *                            fixInvalidHtml().
 *
 * @throws FailedToLoadDomException When DOMDocument::loadHTML() reports failure.
 */
protected function _initialize($source, $requiresFix = false)
{
    $markup = $requiresFix ? $this->fixInvalidHtml($source) : $source;

    $this->_dom = new \DOMDocument();
    $loaded = @$this->_dom->loadHTML($markup);
    if (!$loaded) {
        throw new FailedToLoadDomException(static::MESSAGE_DOM_EXCEPTION);
    }

    $this->_xpath = new \DOMXPath($this->_dom);
}
/**
 * DomCrawler constructor.
 *
 * Parses the page content with libxml errors buffered, then clears them
 * and restores the previous libxml error mode.
 *
 * @param string $pageContent HTML of the page.
 *
 * @throws \InvalidArgumentException When $pageContent is empty.
 */
public function __construct($pageContent)
{
    if (!$pageContent) {
        throw new \InvalidArgumentException('No page content provided');
    }

    $errorModeBefore = libxml_use_internal_errors(true);

    $this->domDocument = new \DOMDocument();
    $this->domDocument->loadHTML($pageContent);

    libxml_clear_errors();
    libxml_use_internal_errors($errorModeBefore);
}
/**
 * {@inheritdoc}
 *
 * Loads the HTML and builds a map from the trimmed text of every table
 * cell whose class attribute contains "e" to the trimmed text of the node
 * that follows it in the same row.
 *
 * @param string $html HTML markup.
 *
 * @return array Map of cell text => following cell text ('' when there is
 *               no following node).
 */
public function parse($html)
{
    $this->document->loadHTML($html);

    $xpath = new DOMXpath($this->document);
    $tds = $xpath->query('//tr//td[contains(@class, "e")]');

    $variables = array();
    foreach ($tds as $td) {
        // Skip whitespace-only text between the cells, and cope with the
        // matched cell being the last node of its row.
        $valueNode = $td->nextSibling;
        while ($valueNode !== null
            && $valueNode->nodeType !== XML_ELEMENT_NODE
            && trim((string) $valueNode->nodeValue) === ''
        ) {
            $valueNode = $valueNode->nextSibling;
        }

        $value = $valueNode === null ? '' : trim((string) $valueNode->nodeValue);
        $variables[trim($td->nodeValue)] = $value;
    }

    return $variables;
}
/**
 * Returns the DOM for the source HTML, creating it on first use.
 *
 * The document stays empty when the source HTML is empty. libxml errors
 * are buffered while loading and the previous libxml error mode is put
 * back afterwards.
 *
 * @return \DOMDocument
 */
private function getSourceDom()
{
    if ($this->sourceDom !== null) {
        return $this->sourceDom;
    }

    // Enabling buffering returns the mode that was active before.
    $errorModeBefore = libxml_use_internal_errors(true);

    $this->sourceDom = new \DOMDocument();
    if ($this->sourceHtml != '') {
        $this->sourceDom->loadHTML($this->sourceHtml);
    }

    libxml_use_internal_errors($errorModeBefore);

    return $this->sourceDom;
}
/**
 * Turns an HTML string into a token list by parsing it with DOMDocument
 * and walking the first <div> inside <body>.
 *
 * @param string $html    HTML to tokenize.
 * @param object $config  Configuration; 'Core', 'AggressivelyFixLt' is read here.
 * @param object $context Context, passed through to normalize() and wrapHTML().
 *
 * @return array Tokens collected by tokenizeDOM().
 */
public function tokenizeHTML($html, $config, &$context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core', 'AggressivelyFixLt')) {
        $char = '[^a-z!\\/]';
        $comment = "/<!--(.*?)(-->|\\z)/is";
        $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
        // Escape the bracket: replacing "<x" with "<x" armored nothing.
        $html = preg_replace("/<({$char})/i", '&lt;\\1', $html);
        $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $tokens = array();
    $this->tokenizeDOM(
        $doc->getElementsByTagName('html')->item(0)
            ->getElementsByTagName('body')->item(0)
            ->getElementsByTagName('div')->item(0),
        $tokens
    );

    return $tokens;
}