/**
 * Build an HTMLDoc around the given markup.
 *
 * The markup is parsed into a DOMDocument and an XPath evaluator bound to that
 * document is prepared. libxml's internal error handling is switched on before
 * parsing and is left on afterwards.
 *
 * @param string $html The HTML to parse
 */
public function __construct($html)
{
    // Collect parser diagnostics in libxml's buffer instead of raising PHP warnings.
    libxml_use_internal_errors(true);

    $this->dom = new \DOMDocument();
    $this->dom->loadHTML($html);

    $this->xp = new \DOMXPath($this->dom);
}
/**
 * Constructor.
 *
 * Parses the supplied HTML into the internal document with whitespace
 * preservation turned off. libxml's internal error handling is enabled only
 * for the duration of the parse and explicitly disabled afterwards.
 *
 * @param string $html
 * @return void
 */
public function __construct($html)
{
    $document = new DomDocument();
    $document->preserveWhiteSpace = false;
    $this->_document = $document;

    libxml_use_internal_errors(true);
    $this->_document->loadHTML($html);
    libxml_use_internal_errors(false);
}
/**
 * Parse an HTML string into a DOMDocument.
 *
 * An empty input yields a fresh, empty document without invoking the parser.
 * Otherwise libxml's internal error handling is enabled (and left enabled) and
 * the markup is loaded; on PHP 5.4.0 and later the LIBXML_NONET option is passed
 * to the parser.
 *
 * @static
 * @access public
 * @param  string $input HTML content
 * @return DOMDocument
 */
public static function getHtmlDocument($input)
{
    $document = new DomDocument();

    if (!empty($input)) {
        libxml_use_internal_errors(true);

        // loadHTML() only accepts an options argument from PHP 5.4.0 onwards.
        $acceptsOptions = version_compare(PHP_VERSION, '5.4.0', '>=');
        if ($acceptsOptions) {
            $document->loadHTML($input, LIBXML_NONET);
        } else {
            $document->loadHTML($input);
        }
    }

    return $document;
}
/**
 * Compute the byte length of the visible text of an HTML fragment.
 *
 * Elements whose inline style declares `display: none` are removed, the
 * document is serialised again, tags are stripped and the length of what
 * remains is returned. The count is taken on the serialised document, so it
 * includes whatever whitespace DOMDocument::saveHTML() emits around the markup.
 *
 * @param string $body HTML fragment to measure
 * @return int 0 for an empty or whitespace-only body
 */
function get_body_length($body)
{
    // DOMDocument rejects empty input, so blank bodies are answered directly.
    if (trim($body) === '') {
        return 0;
    }

    // Buffer parser diagnostics instead of raising warnings on invalid markup,
    // then put the global libxml setting back the way we found it.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($body);
    if (!$previous) {
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previous);

    // XPath cannot match every spelling of a style attribute, so every styled
    // element is fetched and its style string is checked with a regexp.
    $xpath = new DOMXPath($dom);
    foreach ($xpath->query('//*[@style]') as $node) {
        $hidden = preg_match('/display\s*:\s*none\b/i', $node->getAttribute('style'));
        if ($hidden && $node->parentNode !== null) {
            $node->parentNode->removeChild($node);
        }
    }

    // Only visible content is left in the document at this point.
    return strlen(strip_tags($dom->saveHTML()));
}
/**
 * Build (and cache) a preview made of the first top-level elements of the body.
 *
 * $this->Body is wrapped in a minimal UTF-8 HTML page, parsed, and the first
 * $elements element children of <body> are serialised with saveXML() and
 * concatenated. The result is stored in $this->preview and returned as-is on
 * later calls, regardless of the $elements value passed then.
 *
 * @param int|null $elements Number of top-level elements to include; 2 when null
 * @return string Serialised markup of the selected elements
 */
public function getPreview($elements)
{
    if (isset($this->preview)) {
        return $this->preview;
    }

    // Numeric strings are accepted too; the limit is compared as an integer.
    $limit = isset($elements) ? (int) $elements : 2;

    libxml_use_internal_errors(true);
    $dom = new DomDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadHTML('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' . $this->Body . '</body></html>');
    $dom->normalize();

    $preview = '';
    $elementCount = 0;
    foreach ($dom->getElementsByTagName("body")->item(0)->childNodes as $node) {
        // Text, comments and other non-element nodes never count towards the limit.
        if ($node->nodeType !== XML_ELEMENT_NODE) {
            continue;
        }

        $preview .= $dom->saveXML($node);
        $elementCount++;
        if ($elementCount === $limit) {
            break;
        }
    }

    // Carriage returns in the XML prevent the markup from validating. -- cwells
    // saveXML() writes them as the character reference &#13;, so both that form
    // and a raw CR are removed. Ordinary spaces are left untouched.
    $this->preview = str_replace(array('&#13;', "\r"), '', $preview);

    return $this->preview;
}
/**
 * Copy leading top-level elements of an HTML fragment until enough text is collected.
 *
 * The fragment is wrapped in a <div>, parsed, and the wrapper is promoted to be
 * the document root. Its non-text children are then copied one by one into a
 * new document until the accumulated text length reaches $minimum. Top-level
 * text nodes are skipped entirely. Lengths are measured in bytes with strlen().
 *
 * @param string $html    UTF-8 HTML fragment
 * @param int    $minimum Text length at which copying stops
 * @return string Entity-decoded markup of the copied elements
 */
public function truncatehtml($html, $minimum)
{
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

    $source = new \DomDocument();
    $source->loadHTML('<div>' . $html . '</div>');

    // Drop the DOCTYPE, then replace <html> with the wrapper <div> found at html > body > div.
    $source->removeChild($source->firstChild);
    $wrapper = $source->firstChild->firstChild->firstChild;
    $source->replaceChild($wrapper, $source->firstChild);

    $target = new \DomDocument();
    $visibleLength = 0; // text length copied so far, markup excluded

    foreach ($source->documentElement->childNodes as $child) {
        if ($child->nodeType == XML_TEXT_NODE) {
            continue;
        }

        $copy = $target->importNode($child, true);
        $target->appendChild($copy);

        $visibleLength += strlen(html_entity_decode($copy->nodeValue));
        if ($visibleLength >= $minimum) {
            break;
        }
    }

    return html_entity_decode($target->saveHTML());
}
/**
 * Parse $this->html into a DOM document and hand it to the page object.
 *
 * @return $this
 */
public function initializeDomDocument()
{
    $document = new \DomDocument();
    $document->loadHTML($this->html);

    $page = $this->getPage();
    $page->setDocument($document);

    return $this;
}
/**
 * Fetch a page and collect its <meta> tags as a name => content map.
 *
 * The key is taken from the `name` attribute, falling back to `property`; the
 * value is taken from `content`, falling back to `value`. Tags where either
 * side ends up empty are skipped.
 *
 * @param string $url Address passed to runCurl() as CURLOPT_URL
 * @return array|null Map of meta tags, or null when no content was fetched.
 *                    If an Exception is caught, whatever was collected so far
 *                    (possibly null) is returned.
 */
public static function getMetaTags($url)
{
    $tags = null;

    try {
        $contents = self::runCurl(array(CURLOPT_URL => $url));
        if (empty($contents)) {
            return $tags;
        }

        libxml_use_internal_errors(true);
        $document = new \DomDocument();
        $document->loadHTML($contents);

        $tags = array();
        foreach ($document->getElementsByTagName('meta') as $meta) {
            $name = $meta->getAttribute('name');
            if (empty($name)) {
                $name = $meta->getAttribute('property');
            }

            $content = $meta->getAttribute('content');
            if (empty($content)) {
                $content = $meta->getAttribute('value');
            }

            if (empty($name) || empty($content)) {
                continue;
            }
            $tags[$name] = $content;
        }

        return $tags;
    } catch (Exception $e) {
        return $tags;
    }
}
/**
 * Parse an HTML string, fill in a form through fillInDom() and return the result as HTML.
 *
 * The document is created with the charset read from the `sf_charset` setting
 * (default 'UTF-8'). Parser warnings are suppressed.
 *
 * @param string $html     Markup containing the form
 * @param string $formName Passed through to fillInDom()
 * @param string $formId   Passed through to fillInDom()
 * @param mixed  $values   Passed through to fillInDom()
 * @return string Serialised HTML of the document returned by fillInDom()
 */
public function fillInHtml($html, $formName, $formId, $values)
{
    $charset = sfConfig::get('sf_charset', 'UTF-8');

    $document = new DomDocument('1.0', $charset);
    @$document->loadHTML($html);

    $filled = $this->fillInDom($document, $formName, $formId, $values);

    return $filled->saveHTML();
}
/**
 * Run the extraction over an HTML page.
 *
 * Script contents are blanked, the page is parsed into a DOM, the document-wide
 * statistics are stored on this object and extract() is started on <body>.
 *
 * @param string $html
 */
public function exec($html)
{
    mb_language('Japanese');

    // 1. Pre-processing: blank out script text.
    // When a script contains a closing tag inside a string literal, DOMDocument
    // treats the script source as #text, so the text inside scripts is removed.
    // Doing this with a regular expression caused a segmentation fault (stack
    // overflow?), so simple_html_dom is used instead. Its MAX_FILE_SIZE limit
    // was hit, so the library source was edited to three times the default.
    $parsed = str_get_html($html);
    foreach ($parsed->find('script') as $scriptTag) {
        $scriptTag->innertext = '';
    }
    $html = $parsed->outertext;

    // 2. Build the DOM.
    $document = new DomDocument("1.0", "utf-8");
    @$document->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
    $body = $document->getElementsByTagName('body')->item(0);
    $this->preProcessedInput = $body->textContent;

    // 3. Initialise the properties.
    $this->domXPath = new DomXPath($document);
    $this->title = @$document->getElementsByTagName('title')->item(0)->textContent;

    $fullText = $this->scan($body);
    $this->textAll = $fullText;
    // Read right after scan(); presumably scan() is what updates domCount - confirm.
    $this->domCountAll = $this->domCount;
    $this->pancutuationCountAll = $this->calcKutenScore($fullText) + $this->calcTotenScore($fullText);
    $this->textLengthAll = mb_strlen($fullText);
    $this->highScore = -1000000;
    $this->extracedNode = null;

    // 4. Run.
    $this->extract($body);
}
/**
 * Parse an HTML string and return an XPath evaluator bound to the resulting document.
 *
 * Parser warnings are suppressed.
 *
 * @param string $item HTML markup
 * @return DOMXPath
 */
function returnXPathObject($item)
{
    $document = new DOMDocument();
    @$document->loadHTML($item);

    return new DOMXPath($document);
}
/**
 * Download a page and return the serialised <div id="res_script"> it contains.
 *
 * The URL is entity-decoded and stripped of surrounding quotes and spaces. The
 * request is only made when the URL's host resolves to 193.48.96.10; any other
 * address is logged as invalid.
 *
 * @param string $url Address of the page to fetch
 * @return string|null XML of the first matching div; nothing (null) when the
 *                     host is rejected or no such div exists
 */
function hal_parse($url)
{
    $url = trim(html_entity_decode($url), "\"' ");

    $parts = parse_url($url);
    if (gethostbyname($parts['host']) != '193.48.96.10') {
        spip_log("Url invalid", _LOG_ERREUR);
        return;
    }

    spip_log(sprintf("[hal_parse] init_http(%s)", $url), _LOG_DEBUG);
    $content = recuperer_page($url);
    spip_log(sprintf("[hal_parse] init_http(%s): Done", $url), _LOG_DEBUG);

    $document = new DomDocument('1.0', 'UTF-8');
    $document->preserveWhiteSpace = false;

    // Non-ASCII characters become entities before the markup reaches the parser.
    $encoded = mb_convert_encoding($content, "HTML-ENTITIES");
    @$document->loadHTML($encoded);

    $xpath = new DOMXpath($document);
    $matches = $xpath->query('//div[@id="res_script"]');
    if ($matches->length == 0) {
        spip_log("No tag found ...", _LOG_ERREUR);
        return;
    }

    return $document->saveXML($matches->item(0));
}
/**
 * Download a product page and extract its comments.
 *
 * Every element with class="comment" yields one entry built from the text of
 * its first <b> (name), first <p> (text) and first <span> (date). The rating
 * is always reported as 0.
 *
 * A UTF-8 Content-type header is sent as a side effect when headers have not
 * been sent yet.
 *
 * @param string $url Page to read; when empty, the previously hard-coded
 *                    product page is used
 * @return array|null List of arrays with keys id, name, date, rating and text,
 *                    or null when the page cannot be read or has no comments
 */
public static function getProductComments($url)
{
    if (!headers_sent()) {
        header('Content-type: text/html; charset=utf-8');
    }

    // The address used to be overwritten unconditionally; it is now only a fallback.
    if (empty($url)) {
        $url = "http://ormatek.com/products/980";
    }

    $doc = file_get_contents($url);
    if ($doc === false || $doc === '') {
        return null;
    }
    $doc = mb_convert_encoding($doc, 'HTML-ENTITIES', "UTF-8");

    $dom = new DomDocument();
    libxml_use_internal_errors(true);
    $dom->loadHTML($doc);

    $xpath = new DomXPath($dom);
    $nodes = $xpath->query(".//*[@class='comment']");

    // query() returns a DOMNodeList (or false), never an array, so the old
    // is_array() test rejected every result.
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }

    // Text of the first descendant with the given tag, or null when there is none.
    $firstText = function ($node, $tag) {
        $match = $node->getElementsByTagName($tag)->item(0);
        return $match === null ? null : $match->nodeValue;
    };

    $param = array();
    $i = 0;
    foreach ($nodes as $node) {
        $i++;
        $param[] = array(
            'id' => $i,
            'name' => $firstText($node, "b"),
            'date' => $firstText($node, "span"),
            'rating' => 0,
            'text' => $firstText($node, "p"),
        );
    }

    return $param;
}
/**
 * Expand the placeholders of a feed's title template.
 *
 * `title_data` is unserialised into an array (an empty array when missing or
 * invalid). Each of its keys becomes a `{key}` placeholder replaced by the
 * corresponding value, `{actor}` is replaced by the name registered for the
 * feed's uid, and the finished title is passed through mktarget().
 *
 * The `touser` value is run through capi_fhtml() before substitution. That
 * result used to be stored in an undefined `$value` array and was lost.
 *
 * @param array $feed Feed row; reads title_data, title_template and uid
 * @return array The feed with title_data unserialised and title_template expanded
 */
function capi_mkfeedtitle($feed)
{
    global $_SGLOBAL, $_SN, $_SCONFIG;

    // NOTE(review): unserialize() is unsafe on data an attacker can influence -
    // confirm title_data only ever comes from trusted storage.
    $feed['title_data'] = empty($feed['title_data']) ? array() : unserialize($feed['title_data']);
    if (!is_array($feed['title_data'])) {
        $feed['title_data'] = array();
    }

    $searchs = $replaces = array();
    foreach (array_keys($feed['title_data']) as $key) {
        if ($key === "touser") {
            $feed['title_data']['touser'] = capi_fhtml($feed['title_data']['touser']);
        }
        $searchs[] = '{' . $key . '}';
        $replaces[] = $feed['title_data'][$key];
    }

    // NOTE(review): nothing in this function ever fills $actors, so the actor is
    // always the name looked up in $_SN. Kept explicit to preserve that behaviour.
    $actors = array();
    $searchs[] = '{actor}';
    $replaces[] = empty($actors) ? $_SN[$feed['uid']] : implode(lang('dot'), $actors);

    $feed['title_template'] = mktarget(str_replace($searchs, $replaces, $feed['title_template']));

    return $feed;
}
/**
 * Build a page object from a raw response.
 *
 * The response is handed to parseResponse(), the body is loaded as XML or HTML
 * depending on the Content-Type header, and the title and forms are extracted.
 * URLs are converted when the browser asks for it, and the parser is set up last.
 *
 * @param string $url      Address the response came from
 * @param string $response Raw response
 * @param object $browser  Owning browser; its convertUrls flag is read here
 */
function __construct($url, $response, $browser)
{
    $this->url = $url;
    $this->html = $response;
    $this->parseResponse($response);

    // XML when a Content-Type header exists and contains the word "xml".
    $hasContentType = isset($this->headers['Content-Type']);
    $this->is_xml = $hasContentType && preg_match('/\\bxml\\b/i', $this->headers['Content-Type']) ? true : false;

    $this->browser = $browser;

    $this->dom = new DOMDocument();
    if ($this->is_xml) {
        @$this->dom->loadXML($this->html);
    } else {
        @$this->dom->loadHTML($this->html);
    }
    $this->xpath = new DOMXPath($this->dom);

    $titleNode = $this->xpath->query('//title')->item(0);
    $this->title = $titleNode ? $titleNode->nodeValue : '';

    // NOTE(review): `forms` is reset here but the loop appends to `_forms` -
    // confirm which property is the intended one.
    $this->forms = array();
    foreach ($this->xpath->query('//form') as $formElement) {
        $this->_forms[] = new PGForm($formElement, $this);
    }

    if ($browser->convertUrls) {
        $this->convertUrls();
    }

    $this->setParser($this->html, $this->is_xml);

    if (function_exists('gc_collect_cycles')) {
        gc_collect_cycles();
    }
}
/**
 * Send an expression to Google search and reply with the converted result.
 *
 * The result is read from the element with id "topstuff": when it contains
 * exactly six <div> descendants, the text of the 4th and 5th is joined and sent
 * back. In every other case, including a failed request or a page without that
 * element, the "could not convert" message is sent. An empty expression gets a
 * prompt to enter one.
 *
 * @param string $args Expression typed by the user
 * @return void The answer is delivered through reply()
 */
public function process($args)
{
    $args = trim($args);
    if ($args === '') {
        $this->reply("Ingresa una expresion.", $this->currentchannel, $this->nick);
        return;
    }

    $entrada = urlencode($args);
    $url = "https://www.google.com.mx/search?q={$entrada}&oq=200&aqs=chrome.1.69i57j69i59j69i65l2j0l2.3015j0j8&client=ubuntu-browser&sourceid=chrome&es_sm=122&ie=UTF-8";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.120 Chrome/37.0.2062.120 Safari/537.36');
    $html = curl_exec($ch);
    curl_close($ch);

    $answer = "No pude convertir lo que me pides.";

    // curl_exec() returns false on failure; only a real body is worth parsing.
    if (is_string($html) && $html !== '') {
        $web = new DomDocument();
        @$web->loadHTML($html);

        // getElementById() returns null when the element is missing, and calling
        // a method on null is an Error that "@" cannot silence.
        $container = $web->getElementById('topstuff');
        if ($container !== null) {
            $nodos = iterator_to_array($container->getElementsByTagName('div'));
            if (count($nodos) === 6) {
                $answer = utf8_decode($nodos[3]->nodeValue . " " . $nodos[4]->nodeValue);
            }
        }
    }

    $this->reply($answer, $this->currentchannel, $this->nick);
}
/**
 * Parse a page and build a Result from it.
 *
 * The markup is loaded with parser warnings suppressed, an XPath evaluator for
 * it is stored in $this->xpath, and the Result is built from parseLinks() and
 * parseTotalIndex(), called in that order.
 *
 * @param string $html
 *
 * @return Result
 */
public function parse($html)
{
    $document = new \DomDocument();
    @$document->loadHTML($html);
    $this->xpath = new \DomXpath($document);

    $links = $this->parseLinks();
    $totalIndex = $this->parseTotalIndex();

    return new Result($links, $totalIndex);
}
/**
 * Describe how to submit a form found in the current response.
 *
 * The form is looked up by id. The submitted fields are collected from every
 * hidden, text and password <input> of the whole document (not only those
 * inside the form) and merged with $params, which take precedence.
 *
 * @param string $formId  Value of the form's id attribute
 * @param array  $params  Extra or overriding field values
 * @param array  $headers Extra headers; merged over a default Referer of baseUri
 * @return array Keys: method (upper-cased), url (action resolved against
 *               baseUri), headers, params
 */
public function getForm($formId, $params, $headers)
{
    libxml_use_internal_errors(true);
    $dom = new DomDocument();
    $dom->loadHTML($this->response());
    $xpath = new DOMXpath($dom);

    /** @var DOMElement $form */
    $form = $xpath->query("//form[@id='{$formId}']")->item(0);

    $allowedTypes = ["hidden", "text", "password"];
    $form_params = [];
    foreach ($xpath->query('//input') as $input) {
        /** @var DOMElement $input */
        if (!in_array($input->getAttribute("type"), $allowedTypes)) {
            continue;
        }
        $fieldName = $input->getAttribute("name");
        $form_params[$fieldName] = $input->getAttribute("value");
    }

    $headers = array_merge(["Referer" => $this->baseUri], $headers);

    $resolved = Uri::resolve(new Uri($this->baseUri), $form->getAttribute("action"));
    $url = $resolved->__toString();
    $method = strtoupper($form->getAttribute("method"));

    return [
        "method" => $method,
        "url" => $url,
        "headers" => $headers,
        "params" => array_merge($form_params, $params),
    ];
}
/**
 * Collect the name => value pairs of every <input> in an HTML string.
 *
 * Inputs without a name attribute are ignored. An input without a value
 * attribute contributes an empty string. When a name occurs more than once
 * the last occurrence wins.
 *
 * @param string $html Markup to scan
 * @return array Map of input name to value; empty for blank markup
 */
function get_input_tags($html)
{
    $post_data = array();

    // DOMDocument::loadHTML() rejects an empty string, and there is nothing to find anyway.
    if (trim((string) $html) === '') {
        return $post_data;
    }

    // Buffer parser diagnostics for the duration of the load, then restore the
    // caller's libxml setting.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // Must be set before loading to have any effect.
    $dom->preserveWhiteSpace = false;
    $dom->loadHTML($html);
    if (!$previous) {
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previous);

    foreach ($dom->getElementsByTagName('input') as $input) {
        if (!$input->hasAttribute('name')) {
            continue;
        }
        // getAttribute() yields '' for a missing value attribute.
        $post_data[$input->getAttribute('name')] = $input->getAttribute('value');
    }

    return $post_data;
}
/**
 * Return the content of a URL as a SimpleXML tree.
 *
 * The markup is obtained through getHTMLFromURL(), parsed as HTML with parser
 * warnings suppressed, and converted with simplexml_import_dom().
 *
 * @param string $url
 * @return SimpleXMLElement
 */
public function getXMLFromURL($url)
{
    $markup = $this->getHTMLFromURL($url);

    $document = new DomDocument();
    @$document->loadHTML($markup);

    return simplexml_import_dom($document);
}
/**
 * Create an XPath evaluator for the given HTML.
 *
 * Non-ASCII characters are converted to HTML entities before parsing. libxml's
 * internal error handling is enabled and left enabled.
 *
 * @param string $html the html of a website in UTF8
 * @return DOMXPath
 */
public static function load_xpath($html)
{
    libxml_use_internal_errors(true);

    $entityEncoded = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");

    $document = new DomDocument();
    $document->loadHTML($entityEncoded);

    return new DOMXPath($document);
}
/**
 * Read the content of the #follower_count element from a user's Twitter page.
 *
 * The page is downloaded over HTTP and parsed with warnings suppressed. The
 * element's outer HTML is obtained from DOMElement_getOuterHTML() and its
 * outermost opening and closing tags are stripped with a regular expression.
 *
 * @param string $username Appended to "http://twitter.com/"
 * @return string Inner HTML of the element
 */
function getFollowers($username)
{
    $page = file_get_contents("http://twitter.com/" . $username);

    $document = new DomDocument;
    @$document->loadHTML($page);

    $counter = $document->getElementById('follower_count');
    $outerHtml = DOMElement_getOuterHTML($document, $counter);

    // Keep only what sits between the first and the last tag.
    return preg_replace('/^<[^>]*>(.*)<[^>]*>$/', "\\1", $outerHtml);
}
/**
 * Parse an HTML string and return it as a SimpleXML tree.
 *
 * libxml's internal error handling is enabled only while the markup is loaded
 * and is explicitly disabled afterwards.
 *
 * @param string $html
 * @return \SimpleXMLElement Result of simplexml_import_dom() on the parsed document
 */
private function getSimpleXml($html)
{
    $document = new \DomDocument('1.0', 'UTF-8');
    $document->strictErrorChecking = false;

    \libxml_use_internal_errors(true);
    $document->loadHTML($html);
    \libxml_use_internal_errors(false);

    return \simplexml_import_dom($document);
}
/**
 * Parse an HTML string leniently into a DOMDocument.
 *
 * Parser diagnostics are collected in libxml's internal buffer rather than
 * silenced with "@". The caller's libxml error setting is restored afterwards,
 * and the buffer is only cleared when this method enabled it. An empty string
 * yields an empty document instead of reaching the parser.
 *
 * @param string $html Markup to parse
 * @return DomDocument
 */
public static function slurpHtml($html)
{
    $dom = new DomDocument();
    $dom->strictErrorChecking = false;
    $dom->recover = true;

    // loadHTML() refuses an empty string (a ValueError on PHP 8).
    if ((string) $html === '') {
        return $dom;
    }

    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    if (!$previous) {
        libxml_clear_errors();
    }
    libxml_use_internal_errors($previous);

    return $dom;
}
/**
 * Parse HTML into a DOM document.
 *
 * The script is terminated with "Page structure is incorrect" when the markup
 * cannot be loaded.
 *
 * @param string $data HTML
 * @return \DomDocument object
 */
private function getDomDocument($data)
{
    $document = new \DomDocument();

    if (!$document->loadHTML($data)) {
        die("Page structure is incorrect");
    }

    return $document;
}
/**
 * Parse an HTML fragment into a UTF-8 DOM document.
 *
 * The markup is prefixed with an XML encoding declaration before loading -
 * presumably so the parser reads it as UTF-8; confirm. It is loaded with
 * LIBXML_HTML_NODEFDTD and LIBXML_NOENT, with libxml's internal error handling
 * enabled (and left enabled).
 *
 * @param string $html
 * @return \DomDocument
 */
private function getDomFromHtml($html)
{
    libxml_use_internal_errors(true);

    $dom = new \DomDocument('1.0', 'UTF-8');
    $markup = '<?xml encoding="UTF-8">' . $html;
    $dom->loadHTML($markup, LIBXML_HTML_NODEFDTD | LIBXML_NOENT);

    $dom->encoding = 'UTF-8';
    $dom->preserveWhiteSpace = false;

    return $dom;
}
/**
 * Test that all unit prices are returned in decimal format from $testPageSource.
 *
 * The unit-price items are selected by XPath and cleaned with a pattern that
 * removes every character other than digits and dots.
 */
public function testRetrieveNodeValuesFromDomWithClean()
{
    $domDocument = new DomDocument();
    @$domDocument->loadHTML($this->testPageSource);

    $unitPriceXPath = 'id(\'productInfo\')/ul/li[@class="unitPrice"]';
    $nodeValues = WebScraper::retrieveNodeValuesFromDom($domDocument, $unitPriceXPath, '/[^0-9\\.]/');

    // PHPUnit expects (expected, actual); the reverse order gives misleading failure messages.
    $this->assertEquals("1.00", $nodeValues[0]);
    $this->assertEquals("2.00", $nodeValues[1]);
    $this->assertEquals("3.00", $nodeValues[2]);
}
/**
 * Evaluate an XPath expression against an HTML string.
 *
 * The markup is parsed with warnings suppressed and the expression is run from
 * the document root.
 *
 * @param string $strHtml     Markup to search
 * @param string $strSelector XPath expression
 *
 * @return \DOMNodeList|false|null Matching nodes; null when either argument is
 *                                 empty; whatever DOMXPath::query() returns otherwise
 */
function getByXPathExpression($strHtml, $strSelector)
{
    if (true === empty($strHtml) || true === empty($strSelector)) {
        return null;
    }

    $objDomDocument = new \DomDocument();
    // The markup is loaded as given; it used to be prefixed with an undefined variable.
    @$objDomDocument->loadHTML($strHtml);

    $objDomXPath = new \DOMXPath($objDomDocument);

    return $objDomXPath->query($strSelector);
}
/**
 * Extract the validation tokens from the upload form.
 *
 * Reads the `value` attribute of the elements with ids __VIEWSTATE and
 * __EVENTVALIDATION in $this->upload_form and stores each one in
 * $this->form_objects under the same key.
 *
 * @return void
 */
private function getGEDTFormTokens()
{
    // Skip DOM errors.
    libxml_use_internal_errors(true);

    $dom = new DomDocument();
    $dom->loadHTML($this->upload_form);

    foreach (array('__VIEWSTATE', '__EVENTVALIDATION') as $fieldId) {
        $this->form_objects[$fieldId] = $dom->getElementById($fieldId)->getAttribute('value');
    }
}
/**
 * Compile the template string into a DOM document.
 *
 * A non-empty template string is loaded as HTML without a default doctype,
 * with libxml's internal error handling enabled only for the load. Every
 * top-level node is then passed to compileNode(), and finally every element
 * carrying delete="1" is removed from the document.
 *
 * @return void
 */
protected function compile()
{
    $this->templateDocument = new DOMDocument();

    $hasTemplate = !empty($this->templateString);
    if ($hasTemplate) {
        libxml_use_internal_errors(true);
        $this->templateDocument->loadHTML($this->templateString, LIBXML_HTML_NODEFDTD);
        libxml_use_internal_errors(false);
    }

    $this->templateXPath = new DOMXPath($this->templateDocument);

    if (!$this->templateDocument->hasChildNodes()) {
        return;
    }

    foreach ($this->templateDocument->childNodes as $child) {
        $this->compileNode($child, $this->templateDocument);
    }

    // Elements flagged with delete="1" are dropped once compilation is done.
    $flagged = $this->templateXPath->query('//*[@delete="1"]');
    foreach ($flagged as $element) {
        $element->parentNode->removeChild($element);
    }
}