Beispiel #1
0
 /**
  * Create a HTMLDoc object.
  *
  * Parses the given markup into a DOMDocument and prepares a DOMXPath over it.
  * libxml's internal error buffering is switched on (and left on), so
  * malformed HTML does not raise PHP warnings.
  *
  * @param string $html The HTML to parse; an empty string yields an empty document
  */
 public function __construct($html)
 {
     $this->dom = new \DOMDocument();
     libxml_use_internal_errors(true);
     // loadHTML() rejects an empty source (a warning before PHP 8, a
     // ValueError since), so only parse when there is something to parse.
     if ((string) $html !== '') {
         $this->dom->loadHTML($html);
     }
     $this->xp = new \DOMXPath($this->dom);
 }
Beispiel #2
0
 /**
  * Constructor.
  *
  * Parses the markup into the internal DOM document. libxml parse errors are
  * buffered while loading so that malformed HTML does not emit PHP warnings;
  * the libxml error mode that was active before the call is put back afterwards.
  *
  * @param string $html HTML markup to load
  */
 public function __construct($html)
 {
     // libxml_use_internal_errors() returns the previous setting; keep it so
     // the caller's error mode is restored instead of being forced to "off".
     $previous = libxml_use_internal_errors(true);
     $this->_document = new DomDocument();
     $this->_document->preserveWhiteSpace = false;
     $this->_document->loadHTML($html);
     if (!$previous) {
         // Buffering was only enabled for this load: drop what it collected
         // so the errors do not pile up in memory.
         libxml_clear_errors();
     }
     libxml_use_internal_errors($previous);
 }
Beispiel #3
0
 /**
  * Build a DOMDocument from an HTML string.
  *
  * An empty input produces an empty document. libxml error buffering is
  * enabled before parsing and is not switched back off. On PHP 5.4+ the
  * document is loaded with LIBXML_NONET.
  *
  * @static
  * @access public
  * @param  string $input HTML content
  * @return DOMDocument
  */
 public static function getHtmlDocument($input)
 {
     $document = new DomDocument();
     if (!empty($input)) {
         libxml_use_internal_errors(true);
         // The options argument of loadHTML() only exists from PHP 5.4 on
         if (version_compare(PHP_VERSION, '5.4.0', '<')) {
             $document->loadHTML($input);
         } else {
             $document->loadHTML($input, LIBXML_NONET);
         }
     }
     return $document;
 }
/**
 * Measure the visible text of an HTML fragment.
 *
 * Elements whose inline style declares "display: none" are removed, the
 * remaining document is serialized, its tags are stripped and the result is
 * measured with strlen().
 *
 * @param string $body HTML fragment
 * @return int 0 for blank input, otherwise the byte length of the stripped markup
 */
function get_body_length($body)
{
    // DomDocument doesn't like empty strings
    if (trim($body) === '') {
        return 0;
    }
    // loadHTML() is an instance method; calling it statically fails on PHP 8.
    $dom = new DOMDocument();
    // Silence the parser warnings. It would be better to have some valid html as input
    @$dom->loadHTML($body);
    $xpath = new DOMXPath($dom);
    /*
     * Checking any possible syntax of the style attribute with xpath is impossible,
     * so we fetch every element with a style attribute and check it with a regexp.
     * The declaration may end with ';' or with the end of the attribute.
     */
    $hidden = '/(?:^|;)\s*display\s*:\s*none\s*(?:!important\s*)?(?:;|$)/i';
    foreach ($xpath->query('//*[@style]') as $node) {
        if ($node->parentNode !== null && preg_match($hidden, $node->getAttribute('style'))) {
            // Hidden, remove it from its parent
            $node->parentNode->removeChild($node);
        }
    }
    // What is left in the document is only what is visible
    return strlen(strip_tags($dom->saveHTML()));
}
Beispiel #5
0
 /**
  * Return (and cache in $this->preview) the first element nodes of the body.
  *
  * $this->Body is wrapped in a UTF-8 HTML skeleton and parsed; the top-level
  * element children of <body> are serialized with saveXML() until $elements
  * of them have been collected.
  *
  * @param int|null $elements Number of elements to keep; null means 2
  * @return string Serialized markup with every '&#13;' removed
  */
 public function getPreview($elements)
 {
     // Already computed: serve the cached value
     if (isset($this->preview)) {
         return $this->preview;
     }
     $limit = isset($elements) ? $elements : 2;
     libxml_use_internal_errors(true);
     $document = new DomDocument();
     $document->preserveWhiteSpace = false;
     $document->loadHTML('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' . $this->Body . '</body></html>');
     $document->normalize();
     $children = $document->getElementsByTagName("body")->item(0)->childNodes;
     $this->preview = '';
     $taken = 0;
     foreach ($children as $child) {
         // Text, comments etc. between the elements are not part of the preview
         if ($child->nodeType !== XML_ELEMENT_NODE) {
             continue;
         }
         $this->preview .= $document->saveXML($child);
         $taken++;
         if ($taken === $limit) {
             break;
         }
     }
     // Carriage returns in the XML prevent the markup from validating. -- cwells
     $this->preview = str_replace('&#13;', '', $this->preview);
     return $this->preview;
 }
 /**
  * Copy the leading top-level nodes of an HTML fragment until their combined
  * text length reaches a minimum, and return the copied markup.
  *
  * Top-level text nodes of the fragment are skipped: they are neither copied
  * nor counted. Copying stops after the node that makes the running length
  * reach $minimum, so the result can be longer than $minimum.
  *
  * @param string $html    HTML fragment; converted from UTF-8 to HTML entities before parsing
  * @param int    $minimum Text length (as measured by strlen) at which copying stops
  * @return string Serialized markup of the copied nodes, passed through html_entity_decode()
  */
 public function truncatehtml($html, $minimum)
 {
     $oldDocument = new \DomDocument();
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
     // Wrap the fragment so that it has a single known root element
     $oldDocument->loadHTML('<div>' . $html . '</div>');
     // Remove the DOCTYPE, HTML and BODY wrappers added by the parser.
     // NOTE(review): assumes the first child is the DOCTYPE and that the tree
     // is html > body > div — confirm for the libxml version in use.
     $oldDocument->removeChild($oldDocument->firstChild);
     $oldDocument->replaceChild($oldDocument->firstChild->firstChild->firstChild, $oldDocument->firstChild);
     // Running length of the displayed text (without markup)
     $currentLength = 0;
     $newDocument = new \DomDocument();
     foreach ($oldDocument->documentElement->childNodes as $node) {
         // nodeType 3 is a text node; only non-text nodes are copied
         if ($node->nodeType != 3) {
             // Deep-copy the original node into the output document
             $imported = $newDocument->importNode($node, true);
             $newDocument->appendChild($imported);
             $currentLength += strlen(html_entity_decode($imported->nodeValue));
             if ($currentLength >= $minimum) {
                 // The minimum is reached: stop copying
                 break;
             }
         }
     }
     $output = $newDocument->saveHTML();
     return html_entity_decode($output);
 }
 /**
  * Parse $this->html into a DOM document and hand it to the page object.
  *
  * @return $this
  */
 public function initializeDomDocument()
 {
     $document = new \DomDocument();
     $document->loadHTML($this->html);
     $page = $this->getPage();
     $page->setDocument($document);
     return $this;
 }
Beispiel #8
0
 /**
  * Download a page and collect its <meta> tags.
  *
  * Each tag is keyed by its "name" attribute (falling back to "property") and
  * carries its "content" attribute (falling back to "value"). Tags lacking
  * either part are skipped.
  *
  * @param string $url Page to fetch through self::runCurl()
  * @return array|null name => content map; null when nothing was downloaded
  */
 public static function getMetaTags($url)
 {
     $rc = null;
     try {
         $contents = self::runCurl(array(CURLOPT_URL => $url));
         if (empty($contents)) {
             return $rc;
         }
         libxml_use_internal_errors(true);
         $doc = new \DomDocument();
         $doc->loadHTML($contents);
         $metas = $doc->getElementsByTagName('meta');
         $rc = array();
         foreach ($metas as $meta) {
             $name = $meta->getAttribute('name') ?: $meta->getAttribute('property');
             $content = $meta->getAttribute('content') ?: $meta->getAttribute('value');
             if ($name && $content) {
                 $rc[$name] = $content;
             }
         }
         return $rc;
     } catch (Exception $e) {
         // Best effort: hand back whatever was gathered before the failure
         return $rc;
     }
 }
Beispiel #9
0
 /**
  * Fill a form inside an HTML string and return the resulting markup.
  *
  * The markup is parsed (parser warnings silenced), passed to fillInDom()
  * and serialized again with saveHTML().
  *
  * @param string $html     HTML containing the form
  * @param string $formName Forwarded to fillInDom()
  * @param string $formId   Forwarded to fillInDom()
  * @param array  $values   Forwarded to fillInDom()
  * @return string
  */
 public function fillInHtml($html, $formName, $formId, $values)
 {
     $charset = sfConfig::get('sf_charset', 'UTF-8');
     $document = new DomDocument('1.0', $charset);
     @$document->loadHTML($html);
     $filled = $this->fillInDom($document, $formName, $formId, $values);
     return $filled->saveHTML();
 }
 /**
  * Run the extraction over an HTML page.
  *
  * Empties every <script> element, parses the result into a DOM, records the
  * body text and the page title, initializes the "All" totals from scan() on
  * the <body> node and finally calls extract() on that node.
  *
  * @param string $html HTML source; treated as UTF-8 when converted for the DOM parser
  */
 public function exec($html)
 {
     mb_language('Japanese');
     // 1. Preprocess
     // Remove the script text.
     // When a script contains a closing tag inside a string literal, DomDocument
     // treats the script source as #text, so the characters inside script
     // elements are removed.
     // Removing them with a regular expression caused a segmentation fault
     // (stack overflow?), so simple_html_dom is used to blank the script contents.
     // This hit the MAX_FILE_SIZE limit, so the library source has been edited
     // to three times the default.
     $simpleHtml = str_get_html($html);
     foreach ($simpleHtml->find('script') as $script) {
         $script->innertext = '';
     }
     $html = $simpleHtml->outertext;
     // Trim (disabled)
     //		$html = preg_replace('/(\s| )+/mi', ' ', $html);
     // 2. Build the DOM
     $doc = new DomDocument("1.0", "utf-8");
     @$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
     // NOTE(review): $node is null when the parsed document has no <body>;
     // the following property access would then fail — confirm inputs.
     $node = $doc->getElementsByTagName('body')->item(0);
     $this->preProcessedInput = $node->textContent;
     // 3. Initialize the properties
     $this->domXPath = new DomXPath($doc);
     // @ hides the warning raised when the document has no <title>
     $this->title = @$doc->getElementsByTagName('title')->item(0)->textContent;
     $text = $this->scan($node);
     $this->textAll = $text;
     // presumably scan() updates $this->domCount as a side effect — verify
     $this->domCountAll = $this->domCount;
     $this->pancutuationCountAll = $this->calcKutenScore($text) + $this->calcTotenScore($text);
     $this->textLengthAll = mb_strlen($text);
     $this->highScore = -1000000;
     $this->extracedNode = null;
     // 4. Run
     $this->extract($node);
 }
Beispiel #11
0
/**
 * Parse an HTML string and return an XPath evaluator bound to the result.
 *
 * @param string $item HTML markup
 * @return DOMXPath
 */
function returnXPathObject($item)
{
    $document = new DomDocument();
    // Parser warnings about invalid markup are deliberately silenced
    @$document->loadHTML($item);
    return new DOMXPath($document);
}
/**
 * Fetch a page and return the markup of its div with id "res_script".
 *
 * The URL is only fetched when its host resolves to 193.48.96.10; anything
 * else (including a URL without a host) is logged as invalid and rejected.
 *
 * @param string $url URL, possibly HTML-entity encoded and wrapped in quotes
 * @return string|null XML serialization of the first matching div, or null
 *                     when the URL is rejected or no such div exists
 */
function hal_parse($url)
{
    $url = trim(html_entity_decode($url), "\"' ");
    // parse_url() gives null/false when the URL has no usable host; checking
    // it first avoids an undefined index and a pointless DNS lookup.
    $host = parse_url($url, PHP_URL_HOST);
    if (empty($host) || gethostbyname($host) !== '193.48.96.10') {
        spip_log("Url invalid", _LOG_ERREUR);
        return;
    }
    spip_log(sprintf("[hal_parse] init_http(%s)", $url), _LOG_DEBUG);
    $content = recuperer_page($url);
    spip_log(sprintf("[hal_parse] init_http(%s): Done", $url), _LOG_DEBUG);
    $dom = new DomDocument('1.0', 'UTF-8');
    $dom->preserveWhiteSpace = false;
    $str = mb_convert_encoding($content, "HTML-ENTITIES");
    // Parser warnings about invalid markup are deliberately silenced
    @$dom->loadHTML($str);
    $xpath = new DOMXpath($dom);
    $entries = $xpath->query('//div[@id="res_script"]');
    if ($entries->length == 0) {
        spip_log("No tag found ...", _LOG_ERREUR);
        return;
    }
    $res_script = $dom->saveXML($entries->item(0));
    return $res_script;
}
Beispiel #13
0
 /**
  * Scrape the comments of a product page.
  *
  * Every element with class "comment" yields one entry built from its first
  * <b> (name), <p> (text) and <span> (date) descendants; a missing descendant
  * gives null for that field. The rating is always 0.
  *
  * @param string $url Product page URL; when empty, the formerly hard-coded
  *                    page http://ormatek.com/products/980 is used
  * @return array|null List of arrays with the keys id, name, date, rating and
  *                    text; null when the page cannot be fetched or holds no comments
  */
 public static function getProductComments($url)
 {
     // NOTE(review): emitting a response header from a scraper looks like a
     // debugging leftover; it is kept, but skipped once output has started.
     if (!headers_sent()) {
         header('Content-type: text/html; charset=utf-8');
     }
     // The parameter used to be overwritten unconditionally by this URL
     if (empty($url)) {
         $url = "http://ormatek.com/products/980";
     }
     $doc = file_get_contents($url);
     if ($doc === false || $doc === '') {
         return null;
     }
     $doc = mb_convert_encoding($doc, 'HTML-ENTITIES', "UTF-8");
     $dom = new DomDocument();
     libxml_use_internal_errors(true);
     $dom->loadHTML($doc);
     $xpath = new DomXPath($dom);
     $nodes = $xpath->query(".//*[@class='comment']");
     // query() returns a DOMNodeList (never an array) or false; the former
     // is_array() test made this method return null for every page.
     if ($nodes === false || $nodes->length === 0) {
         return null;
     }
     $param = array();
     $i = 0;
     foreach ($nodes as $node) {
         $values = array();
         foreach (array('b', 'p', 'span') as $tag) {
             $first = $node->getElementsByTagName($tag)->item(0);
             $values[$tag] = $first ? $first->nodeValue : null;
         }
         $i++;
         $param[] = array(
             'id' => $i,
             'name' => $values['b'],
             'date' => $values['span'],
             'rating' => 0,
             'text' => $values['p'],
         );
     }
     return $param;
 }
Beispiel #14
0
/**
 * Expand the title template of a feed entry.
 *
 * The serialized title_data is unpacked, each of its keys becomes a "{key}"
 * placeholder, "{actor}" is mapped to $_SN[$feed['uid']], and the placeholders
 * are substituted into title_template before it is passed through mktarget().
 *
 * @param array $feed Feed row; reads title_data, title_template and uid
 * @return array The feed with title_data unserialized and title_template expanded
 */
function capi_mkfeedtitle($feed)
{
    global $_SGLOBAL, $_SN, $_SCONFIG;
    // NOTE(review): unserialize() must only ever see trusted data — confirm
    // that title_data cannot be influenced by users.
    $feed['title_data'] = empty($feed['title_data']) ? array() : unserialize($feed['title_data']);
    if (!is_array($feed['title_data'])) {
        $feed['title_data'] = array();
    }
    //title
    $searchs = $replaces = array();
    foreach (array_keys($feed['title_data']) as $key) {
        if ($key === "touser") {
            // The cleaned value used to be stored in an undefined $value and
            // was therefore discarded; it belongs in the feed itself.
            // NOTE(review): confirm that {touser} is meant to be substituted
            // with the capi_fhtml() result.
            $feed['title_data']['touser'] = capi_fhtml($feed['title_data']['touser']);
        }
        $searchs[] = '{' . $key . '}';
        $replaces[] = $feed['title_data'][$key];
    }
    $searchs[] = '{actor}';
    // $actors was never defined in this function, so the former
    // empty($actors) branch always selected this value.
    $replaces[] = $_SN[$feed['uid']];
    $feed['title_template'] = mktarget(str_replace($searchs, $replaces, $feed['title_template']));
    return $feed;
}
Beispiel #15
0
 /**
  * Build a page object from a raw response.
  *
  * The response is handed to parseResponse(), then parsed as XML when the
  * Content-Type header mentions "xml" and as HTML otherwise. Title and forms
  * are collected through XPath, URLs are converted when the browser asks for
  * it, and setParser() is called last.
  *
  * @param string $url      URL the response came from
  * @param string $response Raw response
  * @param object $browser  Owning browser; its convertUrls flag is read
  */
 function __construct($url, $response, $browser)
 {
     $this->url = $url;
     $this->html = $response;
     $this->parseResponse($response);
     $contentTypeIsXml = isset($this->headers['Content-Type'])
         && preg_match('/\\bxml\\b/i', $this->headers['Content-Type']);
     $this->is_xml = $contentTypeIsXml;
     $this->browser = $browser;
     $this->dom = new DOMDocument();
     // Parser warnings are silenced for both flavours
     if (!$this->is_xml) {
         @$this->dom->loadHTML($this->html);
     } else {
         @$this->dom->loadXML($this->html);
     }
     $this->xpath = new DOMXPath($this->dom);
     $node = $this->xpath->query('//title')->item(0);
     $this->title = $node ? $node->nodeValue : '';
     // NOTE(review): "forms" is reset here while the loop fills "_forms" — confirm both are intended
     $this->forms = array();
     foreach ($this->xpath->query('//form') as $form) {
         $this->_forms[] = new PGForm($form, $this);
     }
     if ($browser->convertUrls) {
         $this->convertUrls();
     }
     $this->setParser($this->html, $this->is_xml);
     // gc_collect_cycles() only exists from PHP 5.3 on
     if (function_exists('gc_collect_cycles')) {
         gc_collect_cycles();
     }
 }
Beispiel #16
0
 /**
  * Look an expression up on Google and reply with the text found in the
  * "topstuff" box of the result page.
  *
  * Replies "Ingresa una expresion." for blank input, and
  * "No pude convertir lo que me pides." whenever the download fails or the
  * page does not contain a #topstuff element with exactly six <div> descendants.
  *
  * @param string $args Expression typed by the user
  * @return void The answer is sent through $this->reply()
  */
 public function process($args)
 {
     $answer = "Ingresa una expresion.";
     $args = trim($args);
     if (strlen($args) > 0) {
         $answer = "No pude convertir lo que me pides.";
         $entrada = urlencode($args);
         $url = "https://www.google.com.mx/search?q={$entrada}&oq=200&aqs=chrome.1.69i57j69i59j69i65l2j0l2.3015j0j8&client=ubuntu-browser&sourceid=chrome&es_sm=122&ie=UTF-8";
         $ch = curl_init();
         curl_setopt($ch, CURLOPT_URL, $url);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.120 Chrome/37.0.2062.120 Safari/537.36');
         $html = curl_exec($ch);
         curl_close($ch);
         // curl_exec() returns false on failure; there is nothing to parse then
         if (is_string($html) && $html !== '') {
             $web = new DomDocument();
             @$web->loadHTML($html);
             // getElementById() returns null when the box is absent; calling a
             // method on it is a fatal error that "@" cannot suppress.
             $topstuff = $web->getElementById('topstuff');
             if ($topstuff !== null) {
                 $nodos = iterator_to_array($topstuff->getElementsByTagName('div'));
                 if (count($nodos) === 6) {
                     $answer = utf8_decode($nodos[3]->nodeValue . " " . $nodos[4]->nodeValue);
                 }
             }
         }
     }
     $this->reply($answer, $this->currentchannel, $this->nick);
 }
Beispiel #17
0
 /**
  * Parse an HTML page into a Result.
  *
  * The markup is loaded with parser warnings silenced, an XPath evaluator
  * for it is stored in $this->xpath, and the Result is assembled from
  * parseLinks() and parseTotalIndex().
  *
  * @param string $html
  *
  * @return Result
  */
 public function parse($html)
 {
     $document = new \DomDocument();
     @$document->loadHTML($html);
     $this->xpath = new \DomXpath($document);
     $links = $this->parseLinks();
     $totalIndex = $this->parseTotalIndex();
     return new Result($links, $totalIndex);
 }
Beispiel #18
0
 /**
  * Describe how to submit a form found in the current response.
  *
  * @param string $formId  Value of the form's id attribute
  * @param array  $params  Extra parameters; they override same-named input values
  * @param array  $headers Extra headers; they override the default Referer
  * @return array Keys: method (upper-cased method attribute), url (action
  *               resolved against $this->baseUri), headers, params
  * @throws \RuntimeException When the response holds no form with that id
  */
 public function getForm($formId, $params, $headers)
 {
     $dom = new DomDocument();
     libxml_use_internal_errors(true);
     $dom->loadHTML($this->response());
     $xpath = new DOMXpath($dom);
     // query() returns false for a malformed expression (e.g. a quote in $formId)
     $matches = $xpath->query("//form[@id='{$formId}']");
     /** @var DOMElement|null $form */
     $form = $matches ? $matches->item(0) : null;
     if ($form === null) {
         // Fail with a clear message instead of a fatal call on null below
         throw new \RuntimeException("Form with id '{$formId}' not found in response");
     }
     $form_params = [];
     $allowedTypes = ["hidden", "text", "password"];
     // NOTE(review): this collects every <input> of the document, not only
     // those inside the selected form — confirm that is intended.
     foreach ($xpath->query('//input') as $element) {
         /** @var DOMElement $element */
         if (in_array($element->getAttribute("type"), $allowedTypes, true)) {
             $form_params[$element->getAttribute("name")] = $element->getAttribute("value");
         }
     }
     $headers = array_merge(["Referer" => $this->baseUri], $headers);
     $url = Uri::resolve(new Uri($this->baseUri), $form->getAttribute("action"))->__toString();
     $method = strtoupper($form->getAttribute("method"));
     return ["method" => $method, "url" => $url, "headers" => $headers, "params" => array_merge($form_params, $params)];
 }
Beispiel #19
0
 /**
  * Collect the name/value pairs of every <input> element in an HTML string.
  *
  * Inputs without a name attribute are ignored; a missing value attribute
  * becomes an empty string. A later input overrides an earlier one that has
  * the same name.
  *
  * @param string $html
  * @return array name => value
  */
 function get_input_tags($html)
 {
     $document = new DomDocument();
     $document->loadHTML($html);
     $fields = array();
     foreach ($document->getElementsByTagName('input') as $input) {
         if (!$input->hasAttribute('name')) {
             continue;
         }
         // getAttribute() already yields '' for an absent value attribute
         $fields[$input->getAttribute('name')] = $input->getAttribute('value');
     }
     return $fields;
 }
 /**
  * Download a URL through getHTMLFromURL() and expose the parsed page as
  * SimpleXML.
  *
  * @param string $url
  * @return SimpleXMLElement
  */
 public function getXMLFromURL($url)
 {
     $document = new DomDocument();
     // Parser warnings about invalid markup are deliberately silenced
     @$document->loadHTML($this->getHTMLFromURL($url));
     return simplexml_import_dom($document);
 }
 /**
  * Create an XPath object for the given HTML.
  *
  * libxml error buffering is switched on (and left on), so malformed markup
  * does not raise PHP warnings. Non-ASCII characters are converted to HTML
  * entities before parsing.
  *
  * @param string $html the html of a website in UTF8; an empty string gives
  *                     an XPath object over an empty document
  * @return DOMXPath
  */
 public static function load_xpath($html)
 {
     libxml_use_internal_errors(true);
     $doc = new DomDocument();
     // loadHTML() rejects an empty source (a warning before PHP 8, a
     // ValueError since), so only parse when there is something to parse.
     if ((string) $html !== '') {
         $utf8_data = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
         $doc->loadHTML($utf8_data);
     }
     return new DOMXPath($doc);
 }
Beispiel #22
0
/**
 * Fetch a Twitter profile page and return the inner markup of the element
 * with id "follower_count".
 *
 * @param string $username Appended to http://twitter.com/
 * @return string Outer HTML of the element with its enclosing tag pair removed
 */
function getFollowers($username){
  $page = file_get_contents("http://twitter.com/".$username);
  $document = new DomDocument;
  // Parser warnings about invalid markup are deliberately silenced
  @$document->loadHTML($page);
  $counter = $document->getElementById('follower_count');
  $outerHtml = DOMElement_getOuterHTML($document, $counter);
  return preg_replace('/^<[^>]*>(.*)<[^>]*>$/', "\\1", $outerHtml);
}
Beispiel #23
0
 /**
  * Parse HTML and return it as a SimpleXML tree.
  *
  * libxml parse errors are buffered while loading so that malformed HTML
  * does not emit PHP warnings; the libxml error mode that was active before
  * the call is put back afterwards.
  *
  * @param string $html
  * @return \SimpleXMLElement
  */
 private function getSimpleXml($html)
 {
     // libxml_use_internal_errors() returns the previous setting; keep it so
     // the caller's error mode is restored instead of being forced to "off".
     $previous = \libxml_use_internal_errors(true);
     $dom = new \DomDocument('1.0', 'UTF-8');
     $dom->strictErrorChecking = false;
     $dom->loadHTML($html);
     if (!$previous) {
         // Buffering was only enabled for this load: drop what it collected
         \libxml_clear_errors();
     }
     \libxml_use_internal_errors($previous);
     return \simplexml_import_dom($dom);
 }
 /**
  * Parse HTML into a DOM document with recovery enabled, strict error
  * checking disabled and parser warnings silenced.
  *
  * @param string $html
  * @return DomDocument
  */
 public static function slurpHtml($html)
 {
     $document = new DomDocument();
     $document->recover = true;
     $document->strictErrorChecking = false;
     // TODO: Any way around error suppression?
     @$document->loadHTML($html);
     return $document;
 }
Beispiel #25
0
 /**
  * Parse HTML into a DOM document; the script terminates when loading fails.
  *
  * @param string $data HTML
  * @return \DomDocument
  */
 private function getDomDocument($data)
 {
     $document = new \DomDocument();
     if (!$document->loadHTML($data)) {
         die("Page structure is incorrect");
     }
     return $document;
 }
 /**
  * Parse an HTML fragment as UTF-8 into a DOM document.
  *
  * An XML encoding declaration is prepended to the markup before loading.
  * libxml error buffering is switched on and left on.
  *
  * @param string $html
  * @return \DomDocument
  */
 private function getDomFromHtml($html)
 {
     libxml_use_internal_errors(true);
     $dom = new \DomDocument('1.0', 'UTF-8');
     $source = '<?xml encoding="UTF-8">' . $html;
     // NOTE(review): LIBXML_NOENT enables entity substitution — confirm it is
     // wanted for untrusted input.
     $dom->loadHTML($source, LIBXML_HTML_NODEFDTD | LIBXML_NOENT);
     $dom->preserveWhiteSpace = false;
     $dom->encoding = 'UTF-8';
     return $dom;
 }
Beispiel #27
0
 /**
  * Test that all unit prices are returned in decimal format from $testPageSource.
  *
  * The third argument passed to WebScraper::retrieveNodeValuesFromDom() is the
  * pattern '/[^0-9\\.]/'; the returned values are expected to be bare decimals.
  */
 public function testRetrieveNodeValuesFromDomWithClean()
 {
     $domDocument = new DomDocument();
     @$domDocument->loadHTML($this->testPageSource);
     $unitPriceXPath = 'id(\'productInfo\')/ul/li[@class="unitPrice"]';
     $nodeValues = WebScraper::retrieveNodeValuesFromDom($domDocument, $unitPriceXPath, '/[^0-9\\.]/');
     // assertEquals() takes (expected, actual); the reverse order produces
     // misleading failure messages.
     $this->assertEquals("1.00", $nodeValues[0]);
     $this->assertEquals("2.00", $nodeValues[1]);
     $this->assertEquals("3.00", $nodeValues[2]);
 }
/**
 * Evaluate an XPath expression against an HTML string.
 *
 * @param string $strHtml     HTML markup to parse (parser warnings are silenced)
 * @param string $strSelector XPath expression
 *
 * @return \DOMNodeList|false|null Matching nodes; false when the expression is
 *                                 rejected by DOMXPath::query(); null when
 *                                 either argument is empty
 **/
function getByXPathExpression($strHtml, $strSelector)
{
    if (true === empty($strHtml) || true === empty($strSelector)) {
        return null;
    }
    $objDomDocument = new \DomDocument();
    // An undefined $str used to be prepended here; it only raised a
    // notice/warning and contributed nothing to the markup.
    @$objDomDocument->loadHTML($strHtml);
    $objDomXPath = new \DOMXPath($objDomDocument);
    return $objDomXPath->query($strSelector);
}
Beispiel #29
0
 /**
  * Copy the value attributes of the elements with the ids __VIEWSTATE and
  * __EVENTVALIDATION from $this->upload_form into $this->form_objects.
  *
  * libxml error buffering is switched on (and left on) so that DOM parse
  * errors are skipped.
  */
 private function getGEDTFormTokens()
 {
     libxml_use_internal_errors(true);
     $document = new DomDocument();
     $document->loadHTML($this->upload_form);
     // Extract the validation data, one token field at a time
     foreach (array('__VIEWSTATE', '__EVENTVALIDATION') as $field) {
         $this->form_objects[$field] = $document->getElementById($field)->getAttribute('value');
     }
 }
Beispiel #30
0
 /**
  * Parse the template string and compile every node of the resulting document.
  *
  * A fresh DOMDocument and DOMXPath are always created; the template is only
  * loaded when $this->templateString is not empty. After compileNode() has
  * been called for each top-level node, every element carrying delete="1"
  * is removed from the document.
  */
 protected function compile()
 {
     $this->templateDocument = new DOMDocument();
     if (!empty($this->templateString)) {
         // libxml_use_internal_errors() returns the previous setting; keep it
         // so the caller's error mode is restored instead of being forced off.
         $previous = libxml_use_internal_errors(true);
         $this->templateDocument->loadHTML($this->templateString, LIBXML_HTML_NODEFDTD);
         if (!$previous) {
             // Buffering was only enabled for this load: drop what it collected
             libxml_clear_errors();
         }
         libxml_use_internal_errors($previous);
     }
     $this->templateXPath = new DOMXPath($this->templateDocument);
     if ($this->templateDocument->hasChildNodes()) {
         foreach ($this->templateDocument->childNodes as $node) {
             $this->compileNode($node, $this->templateDocument);
         }
         // Elements flagged during compilation are dropped in one pass at the end
         $removals = $this->templateXPath->query('//*[@delete="1"]');
         foreach ($removals as $remove) {
             $remove->parentNode->removeChild($remove);
         }
     }
 }