Example #1
1
 /**
  * Evaluate an XPath expression against the HTML held in $this->_response.
  *
  * Uses DOMDocument/DOMXPath when both classes exist; otherwise falls back to
  * the legacy domxml functions when domxml_open_mem() is available.
  *
  * @param string $xpath_expression XPath expression to evaluate
  * @return DOMNodeList|stdClass|false Query result (the legacy branch returns
  *         an object exposing only ->length), or false after a reported failure
  */
 function _testXPath($xpath_expression)
 {
     if (!class_exists('DOMDocument') || !class_exists('DOMXPath')) {
         if (!function_exists('domxml_open_mem')) {
             $this->fail('No xpath support built in');
             return false;
         }
         $dom = domxml_open_mem($this->_response);
         if (!$dom) {
             $this->fail('Error parsing doc');
             return false;
         }
         // The call is kept from the original; its return value was never used.
         $dom->xpath_init();
         $ctx = $dom->xpath_new_context();
         $result = $ctx->xpath_eval($xpath_expression);
         // Mimic the one property of DOMNodeList that callers of this helper read.
         $return = new stdClass();
         $return->length = count($result->nodeset);
         return $return;
     }
     if (extension_loaded('domxml')) {
         $this->fail('Please disable the domxml extension. Only php5 builtin domxml is supported');
         return false;
     }
     $dom = new DOMDocument();
     $dom->loadHtml($this->_response);
     $xpath = new DOMXPath($dom);
     return $xpath->query($xpath_expression);
 }
Example #2
0
 /**
  * Initialise the parent response and parse the body into $this->_domDocument.
  *
  * @param mixed $url     Passed through to the parent constructor
  * @param mixed $status  Passed through to the parent constructor
  * @param array $headers Passed through to the parent constructor
  * @param mixed $body    Passed through to the parent and parsed as HTML
  */
 protected function __construct($url, $status, array $headers, $body)
 {
     parent::__construct($url, $status, $headers, $body);
     $this->_domDocument = new DOMDocument();
     $this->_domDocument->preserveWhiteSpace = true;
     // An empty body cannot be parsed: loadHTML() only warns on older PHP but
     // throws a ValueError on PHP 8, which "@" does not silence. Leave the
     // document empty in that case.
     if ($body === null || $body === '') {
         return;
     }
     // Silenced because invalid documents tend to raise a lot of warnings.
     @$this->_domDocument->loadHtml($body);
 }
Example #3
0
 /**
  * Load an HTML string into the wrapped document, discarding parser errors.
  *
  * @param  string $html HTML markup to parse
  * @return $this
  * @throws \InvalidArgumentException if $html is not a string
  */
 public function loadHtml($html)
 {
     if (!is_string($html)) {
         throw new InvalidArgumentException(sprintf('%s expects parameter 1 to be string, %s given', __METHOD__, is_object($html) ? get_class($html) : gettype($html)));
     }
     // Remember the caller's libxml configuration so it is restored afterwards
     // instead of being forced to false.
     $previousErrors = libxml_use_internal_errors(true);
     // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
     // only toggled on older runtimes.
     $toggleEntityLoader = PHP_VERSION_ID < 80000;
     $previousLoader = false;
     if ($toggleEntityLoader) {
         $previousLoader = libxml_disable_entity_loader(true);
     }
     $this->document->loadHtml($html);
     libxml_clear_errors();
     if ($toggleEntityLoader) {
         libxml_disable_entity_loader($previousLoader);
     }
     libxml_use_internal_errors($previousErrors);
     return $this;
 }
/**
 * Scrape the member table of the page at $url.
 *
 * Reads the rows of the first <tbody>. Each row contributes the text and href
 * of the first link in its first cell, the second cell's text with the global
 * $parteinameFilter removed, and, when non-empty, the third cell's text.
 * Rows without a first-cell link are skipped.
 *
 * @param string $url Address handed to get_html()
 * @return array|false List of member arrays, or false when get_html() failed
 */
function get_members($url)
{
    global $parteinameFilter;
    $html = get_html($url);
    if ($html === false) {
        echo 'connection error';
        return false;
    }
    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();
    $dom = new DOMDocument();
    $dom->loadHtml($html);
    $members = array();
    // A page without a <tbody> yields no members instead of a fatal error.
    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody ? $tbody->getElementsByTagName('tr') : array();
    foreach ($trs as $tr) {
        $tds = $tr->getElementsByTagName('td');
        $firstCell = $tds->item(0);
        $link = $firstCell ? $firstCell->getElementsByTagName('a')->item(0) : null;
        if (!$link) {
            // Header or malformed row: nothing to extract.
            continue;
        }
        $parteiCell = $tds->item(1);
        $azeCell = $tds->item(2);
        $member = array(
            'name' => $link->nodeValue,
            'link' => $link->getAttribute('href'),
            'partei' => str_replace($parteinameFilter, '', $parteiCell ? $parteiCell->nodeValue : ''),
        );
        $aze = str_replace(' ', '', htmlentities($azeCell ? $azeCell->nodeValue : ''));
        if ($aze) {
            $member['amtszeitende'] = $aze;
        }
        $members[] = $member;
    }
    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);
    return $members;
}
/**
 * Scrape the committee table of the page at $url.
 *
 * Reads the rows of the first <tbody> and collects the text and href of the
 * first link found in each row's first cell. Rows without such a link are
 * skipped; previously they re-appended the preceding committee, or an
 * undefined variable on the first row.
 *
 * @param string $url Address handed to get_html()
 * @return array|false List of ['name' => ..., 'link' => ...] arrays, or false
 *                     when get_html() failed
 */
function get_commitees($url)
{
    $html = get_html($url);
    if ($html === false) {
        echo 'connection error';
        return false;
    }
    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();
    $dom = new DOMDocument();
    $dom->loadHtml($html);
    $commitees = array();
    // A page without a <tbody> yields no committees instead of a fatal error.
    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody ? $tbody->getElementsByTagName('tr') : array();
    foreach ($trs as $tr) {
        $firstCell = $tr->getElementsByTagName('td')->item(0);
        $link = $firstCell ? $firstCell->getElementsByTagName('a')->item(0) : null;
        if ($link) {
            $commitees[] = array('name' => $link->nodeValue, 'link' => $link->getAttribute('href'));
        }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);
    return $commitees;
}
Example #6
0
 /**
  * Currently disabled: dumps a "needs fixed" notice and terminates the script.
  *
  * Everything after the exit is unreachable. The dead code appears intended to
  * set attribute $name to $value on every child of type $elementType, both on
  * element objects and on raw HTML strings held in the contents array.
  *
  * NOTE(review): the unreachable body cannot work as written. It uses $this
  * inside a static method and reads $fixRawHtml, which is never defined here.
  *
  * @param string $elementType Class name suffix appended to '\AbstractElement\'
  * @param string $name        Attribute name
  * @param mixed  $value       Attribute value
  */
 static function fixChildrenAttribute($elementType, $name, $value)
 {
     // Guard left by the author: abort loudly rather than run the broken code below.
     var_dump('AbstractElement\\Helper::fixChildrenAttribute needs fixed');
     exit;
     // ---- unreachable from here on ----
     $classPath = '\\AbstractElement\\' . $elementType;
     // each element in contents array for this object
     foreach ($this->contents as $index => $content) {
         // is this an object that extends AbstractElement?
         if (is_a($content, '\\Element')) {
             // is this of the right element type?
             if (is_a($content, $classPath)) {
                 $content->setAttribute($name, $value);
             }
             // recurse into the child's own contents
             $content->fixChildrenAttribute($elementType, $name, $value);
         } elseif (is_string($content) && $fixRawHtml) {
             // raw HTML string: parse it, patch matching tags, and store the re-serialised markup
             $dom = new \DOMDocument();
             $dom->loadHtml($content);
             // the tag name is read from the element class's 'tag' constant
             $reflectionClass = new \ReflectionClass($classPath);
             $elements = $dom->getElementsByTagName($reflectionClass->getConstant('tag'));
             foreach ($elements as $element) {
                 $element->setAttribute($name, $value);
             }
             $this->contents[$index] = $dom->saveHTML();
         }
     }
 }
Example #7
0
 /**
  * Replace inline data-URL images in an HTML fragment with uploaded files.
  *
  * Every <img> whose src is a "data:image/<subtype>;..." URL is saved under
  * uploads/<media type description>/ with a generated file name, and its src
  * is rewritten to the asset URL of that file. Other images are left untouched.
  *
  * @param string $texto      HTML fragment (e.g. the submitted message body)
  * @param mixed  $tipo_midia Key passed to TipoMidia::findOrFail()
  * @return string The fragment with rewritten image sources
  */
 public static function uploadTextarea($texto, $tipo_midia)
 {
     $nomeTipo = TipoMidia::findOrFail($tipo_midia)->descricao;
     $dom = new \DOMDocument();
     $dom->loadHtml($texto, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
     foreach ($dom->getElementsByTagName('img') as $img) {
         $src = $img->getAttribute('src');
         // The src comes from submitted markup. Only accept a data URL that
         // starts the attribute and whose subtype is a plain token, because
         // the subtype ends up in the file name on disk.
         if (!preg_match('/^data:image\\/(?<mime>[A-Za-z0-9.+-]+);/', $src, $groups)) {
             continue;
         }
         $mimetype = $groups['mime'];
         // Generate a random file name.
         $filename = md5(uniqid());
         $filepath = "uploads/" . $nomeTipo . "/" . $filename . '.' . $mimetype;
         // @see http://image.intervention.io/api/
         Image::make($src)->encode($mimetype, 100)->save(public_path($filepath));
         $img->setAttribute('src', asset($filepath));
     }
     return $dom->saveHTML();
 }
/**
 * Strip every <link> element from an HTML string.
 *
 * The content is parsed with DOMDocument and each top-level node of the
 * resulting document is serialised back. libxml errors above warning level
 * are reported through fof_log().
 *
 * @param string $content UTF-8 HTML
 * @return string Serialised document without <link> elements
 */
function remove_link_tags($content)
{
    $old_xml_err = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // Encode non-ASCII characters as numeric entities so the parser does not
    // have to guess the charset. This replaces the 'HTML-ENTITIES' conversion
    // of mb_convert_encoding(), which is deprecated since PHP 8.2.
    $dom->loadHtml(mb_encode_numericentity($content, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8'));
    // getElementsByTagName() returns a live list. Removing nodes inside a
    // foreach skips every other match, so always remove the current first one.
    $links = $dom->getElementsByTagName('link');
    while ($links->length > 0) {
        $link = $links->item(0);
        $link->parentNode->removeChild($link);
    }
    $content_out = '';
    for ($node = $dom->firstChild; $node; $node = $node->nextSibling) {
        $content_out .= $dom->saveHTML($node);
    }
    foreach (libxml_get_errors() as $error) {
        /* just ignore warnings */
        if ($error->level === LIBXML_ERR_WARNING) {
            continue;
        }
        fof_log(__FUNCTION__ . ': ' . $error->message);
    }
    libxml_clear_errors();
    libxml_use_internal_errors($old_xml_err);
    return $content_out;
}
Example #9
0
 /**
  * Fetch the conversation with a given sender.
  *
  * Downloads the UserConversation page for $sender and extracts every
  * div.messageContent block. A message without its own sender label inherits
  * the sender of the preceding message.
  *
  * NOTE(review): $sender and $region are inserted into the URL unencoded, as
  * before; confirm whether callers pre-encode them.
  *
  * @access public
  * @param  string $gamertag
  * @param  string $region
  * @param  string $sender
  * @return array|false List of ['message', 'time', 'sender'] arrays, or false
  *                     when $sender or the trimmed $gamertag is empty
  */
 public function fetchConversationWith($gamertag, $region, $sender)
 {
     $gamertag = trim($gamertag);
     // Validate before doing any network I/O.
     if (empty($sender) || empty($gamertag)) {
         return false;
     }
     $url = 'https://account.xbox.com/' . $region . '/Messages/UserConversation?senderGamerTag=' . $sender;
     $data = $this->fetch_url($url);
     $doc = new DOMDocument();
     $doc->loadHtml($data);
     $xpath = new DOMXPath($doc);
     $messages = array();
     $last_sender = "";
     foreach ($xpath->query("//div[@class='messageContent']") as $node) {
         // Serialise the block once and run all three lookups on it.
         $markup = $node->ownerDocument->saveHTML($node);
         $body = $this->find($markup, '<div class="messageBody">', '</div>');
         $time = $this->find($markup, '<div class="sentDate localTime">', '</div>');
         $from = $this->find($markup, '<div class="senderGamertag">', '</div>');
         if ($from) {
             $last_sender = $from;
         }
         $messages[] = array('message' => $body, 'time' => $time, 'sender' => $last_sender);
     }
     return $messages;
 }
Example #10
0
 /**
  * Load HTML or XML.
  *
  * When the content does not start with an XML prolog, one declaring the
  * document's current encoding is prepended before parsing.
  *
  * @param string $string  HTML or XML string or file path
  * @param bool   $isFile  Indicates that the first parameter is a file path
  * @param string $type    Type of document: "xml" or "html" (case-insensitive)
  * @param int    $options Additional libxml parameters
  * @return $this
  * @throws InvalidArgumentException if an argument has the wrong type or
  *                                  $type is not "xml"/"html"
  */
 public function load($string, $isFile = false, $type = 'html', $options = 0)
 {
     if (!is_string($string)) {
         throw new InvalidArgumentException(sprintf('%s expects parameter 1 to be string, %s given', __METHOD__, is_object($string) ? get_class($string) : gettype($string)));
     }
     if (!is_string($type) || !in_array(strtolower($type), ['xml', 'html'], true)) {
         // Report the offending value, or its type when it is not a string.
         // The previous message printed the method name in its place.
         $given = is_string($type) ? $type : (is_object($type) ? get_class($type) : gettype($type));
         throw new InvalidArgumentException(sprintf('Document type must be "xml" or "html", %s given', $given));
     }
     if (!is_integer($options)) {
         throw new InvalidArgumentException(sprintf('%s expects parameter 4 to be integer, %s given', __METHOD__, is_object($options) ? get_class($options) : gettype($options)));
     }
     $string = trim($string);
     if ($isFile) {
         $string = $this->loadFile($string);
     }
     if (substr($string, 0, 5) !== '<?xml') {
         $prolog = sprintf('<?xml version="1.0" encoding="%s"?>', $this->document->encoding);
         $string = $prolog . $string;
     }
     $this->type = strtolower($type);
     Errors::disable();
     if ($this->type === 'xml') {
         $this->document->loadXml($string, $options);
     } else {
         $this->document->loadHtml($string, $options);
     }
     Errors::restore();
     return $this;
 }
Example #11
0
 /**
  * Normalise an HTML fragment.
  *
  * Runs the fragment through self::xss(), wraps plain text in paragraphs,
  * drops every attribute that is not whitelisted, maps font/span/div/b/br
  * markup to p/strong/i, round-trips the result through DOMDocument to repair
  * the markup, and finally strips the document wrapper and empty elements.
  *
  * @param string $html Fragment to clean; empty values are returned unchanged
  * @return string Cleaned fragment
  */
 public static function fix($html)
 {
     if (empty($html)) {
         return $html;
     }
     $html = self::xss($html);
     if (substr($html, 0, 2) !== '<p') {
         $html = '<p>' . implode('</p><p>', preg_split('/[\\n\\r]/', $html)) . '</p>';
     }
     $html = trim(str_replace(["\n", "\r"], ' ', self::xss($html)));
     $html = preg_replace('#\\s{2,}#', ' ', $html);
     $valid = 'class|src|target|alt|title|href|rel';
     $html = preg_replace('#<(font|span) style="font-weight[^"]+">([^<]+)</(font|span)>#i', '<strong>$2</strong>', $html);
     $html = preg_replace('#<(font|span) style="font-style:\\s*italic[^"]+">([^<]+)</(font|span)>#i', '<i>$2</i>', $html);
     // Protect whitelisted attributes as |name|, strip all remaining
     // attributes, then turn the protected ones back into name=.
     $html = preg_replace('# (' . $valid . ')=#i', ' |$1|', $html);
     $html = preg_replace('# [a-z]+=["\'][^"\']*["\']#i', '', $html);
     $html = preg_replace('#\\|(' . $valid . ')\\|#i', ' $1=', $html);
     $html = preg_replace('#</?(font|span)[^>]*>#', '', $html);
     $html = preg_replace('#<(/?)div#', '<$1p', $html);
     $html = preg_replace('#<(/?)b>#', '<$1strong>', $html);
     $html = preg_replace('#<br\\s*/?>#', '</p><p>', $html);
     // Remember the caller's libxml setting so it can be restored instead of
     // being forced off.
     $previousErrors = libxml_use_internal_errors(true);
     $DOM = new \DOMDocument();
     $DOM->recover = true;
     $DOM->preserveWhiteSpace = false;
     $DOM->substituteEntities = false;
     $DOM->loadHtml(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'), LIBXML_NOBLANKS | LIBXML_ERR_NONE);
     $DOM->encoding = 'utf-8';
     $html = $DOM->saveHTML();
     // Drop the collected parser errors so they do not pile up in libxml's buffer.
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrors);
     // Remove the doctype/html/head/body wrapper added by DOMDocument.
     $html = preg_replace('~<(?:!DOCTYPE|/?(?:\\?xml|html|head|body))[^>]*>\\s*~i', '', $html);
     // Remove elements that contain nothing but whitespace.
     $html = preg_replace('#<([^\\s]+)[^>]*>\\s*</\\1>#', '', $html);
     $html = preg_replace('#</p>\\s+<p#', '</p><p', $html);
     return trim(str_replace('&nbsp;', ' ', $html));
 }
 /**
  * Returns array, containing detailed results for any Google search.
  *
  * Result pages are requested from the google.<GOOGLE_TLD> custom search for
  * as long as the response contains the `<div id="nn"></div>` marker, with a
  * delay that grows by 0.2s before each further request.
  *
  * NOTE(review): $query is inserted into the URL as given; confirm that
  * callers URL-encode it.
  *
  * @param        string        $query      String, containing the search query.
  * @return       array                     Returns list of arrays with the keys 'url', 'title' and 'descr'.
  * @throws       SEOstatsException         If the response refers to Google support answer 86640.
  */
 public static function googleArray($query)
 {
     $result = array();
     $pages = 1;
     $delay = 0;
     for ($start = 0; $start < $pages; $start++) {
         $url = 'http://www.google.' . GOOGLE_TLD . '/custom?q=' . $query . '&filter=0' . '&num=100' . ($start == 0 ? '' : '&start=' . $start . '00');
         $str = SEOstats::cURL($url);
         // Nothing to parse. loadHTML() would throw on an empty string under
         // PHP 8, which "@" cannot silence.
         if (!is_string($str) || $str === '') {
             break;
         }
         if (preg_match("#answer=86640#i", $str)) {
             $e = 'Please read: http://www.google.com/support/websearch/' . 'bin/answer.py?&answer=86640&hl=en';
             throw new SEOstatsException($e);
         }
         $html = new DOMDocument();
         @$html->loadHtml($str);
         $xpath = new DOMXPath($html);
         $links = $xpath->query("//div[@class='g']//a");
         $descs = $xpath->query("//td[@class='j']//div[@class='std']");
         $i = 0;
         foreach ($links as $link) {
             // Skip the "Cached" / "Similar" helper links of a result.
             if (preg_match('#cache#si', $link->textContent) || preg_match('#similar#si', $link->textContent)) {
                 continue;
             }
             // There may be fewer descriptions than links; do not dereference null.
             $desc = $descs->item($i);
             $result[] = array('url' => $link->getAttribute('href'), 'title' => utf8_decode($link->textContent), 'descr' => $desc ? utf8_decode($desc->textContent) : '');
             $i++;
         }
         if (preg_match('#<div id="nn"><\\/div>#i', $str) || preg_match('#<div id=nn><\\/div>#i', $str)) {
             // A "next" marker is present: schedule one more page and back off.
             $pages += 1;
             $delay += 200000;
             usleep($delay);
         } else {
             $pages -= 1;
         }
     }
     return $result;
 }
Example #13
0
 /**
  * ProxyClient writes the response to $outputFile instead of $content.
  *
  * The body found after the header lines of the output file must equal the
  * fixture once both are normalised through DOMDocument.
  */
 function testRequestToOutputFile()
 {
     $client = new ProxyClient();
     $client->URL = df_absolute_url('tests/test_ProxyClient/test1.html');
     $outputFile = tempnam(sys_get_temp_dir(), 'test_ProxyClient');
     $client->outputFile = $outputFile;
     $client->process();
     $this->assertEquals(null, $client->content, 'Content should be written to output file, not saved to variable.');
     $expected = file_get_contents('tests/test_ProxyClient/test1.html');
     $doc = new DOMDocument();
     @$doc->loadHtml($expected);
     $expected = $doc->saveHtml();
     unset($doc);
     $fh = fopen($outputFile, 'r');
     $this->assertTrue(is_resource($fh), 'Output file should be readable.');
     while (!feof($fh) and trim($line = fgets($fh, 1024))) {
         // We skip the headers
     }
     // Capture everything after the headers.
     ob_start();
     fpassthru($fh);
     fclose($fh);
     $actual = ob_get_contents();
     ob_end_clean();
     // The temp file is no longer needed; remove it before the final assertion
     // so that a failing comparison does not leave it behind.
     unlink($outputFile);
     $doc = new DOMDocument();
     @$doc->loadHtml($actual);
     $actual = $doc->saveHtml();
     unset($doc);
     $this->assertEquals($expected, $actual);
 }
Example #14
0
 /**
  * Of the three images in the fixture page, exactly one is expected to
  * remain after processing.
  */
 public function it_should_remove_filtered_file_names()
 {
     $markup = '<html><body><img src="img/spacer.gif" /><img src="img/sprite.png" /><img src="img/cat.jpg" /></body></html>';
     $page = new \DOMDocument();
     $page->loadHtml($markup);
     $constructorArguments = [
         UrlDocument::build($page, 'http://simplegifts.co'),
         new StubFileSizeAnalyzer(),
     ];
     $this->beConstructedThrough('load', $constructorArguments);
     $this->process()->shouldHaveCount(1);
 }
 /**
  * Read four metrics from the first four <td> cells of the Open Site Explorer
  * "links" page for a URL.
  *
  * @param string|false $url URL to look up; falls back to self::getUrl()
  * @return array Keys 'domainAuthority', 'pageAuthority', 'linkingRootDomains'
  *               and 'totalInboundLinks'; a value is '' when its cell is missing
  */
 public function getPageMetrics($url = false)
 {
     $url = $url ? $url : self::getUrl();
     $dataUrl = sprintf(services::OPENSITEEXPLORER_URL, 'links', '1', $url);
     $html = HttpRequest::sendRequest($dataUrl);
     $doc = new DOMDocument();
     // An empty response cannot be parsed; loadHTML() throws on it under
     // PHP 8, which "@" cannot silence.
     if (is_string($html) && $html !== '') {
         @$doc->loadHtml($html);
     }
     $cells = $doc->getElementsByTagName('td');
     $metrics = array();
     foreach (array('domainAuthority', 'pageAuthority', 'linkingRootDomains', 'totalInboundLinks') as $position => $name) {
         // The page layout may change; do not dereference a missing cell.
         $cell = $cells->item($position);
         $metrics[$name] = $cell ? trim(strip_tags($cell->textContent)) : '';
     }
     return $metrics;
 }
Example #16
0
 /**
  * Translate the text nodes of an HTML fragment.
  *
  * The fragment is parsed into a DOMDocument, handed to translateNodeText(),
  * serialised again, and the wrapper that DOMDocument adds around it is cut
  * off by fixed character offsets.
  *
  * @param string $htmlString HTML fragment
  * @return string Translated fragment
  */
 protected function translateHTML($htmlString)
 {
     // Characters cut from the serialised document. Presumably the default
     // doctype line plus "<html><body>" in front and "</body></html>" plus a
     // newline at the end; verify against the libxml output in use.
     $leadingWrapperLength = 119;
     $trailingWrapperLength = 15;

     $document = new DOMDocument();
     if (!$document) {
         return $htmlString;
     }
     $document->loadHtml($htmlString);
     $this->translateNodeText($document);
     $serialised = $document->saveHTML();
     return mb_substr($serialised, $leadingWrapperLength, -$trailingWrapperLength);
 }
Example #17
0
/**
 * Download the 99damage CS:GO matches page and return the text of every <a>
 * that is a direct child of div#content.
 *
 * On a cURL error the message is echoed and the script exits.
 *
 * An earlier, disabled draft also followed each link's href and collected the
 * match_head / match_names / match_logos blocks of the detail pages.
 *
 * @return array List of link texts
 */
function getWebContent()
{
    $handle = curl_init("http://csgo.99damage.de/de/matches");
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
    $markup = curl_exec($handle);
    if (curl_errno($handle)) {
        // Transfer failed: report and stop.
        echo 'Scraper error: ' . curl_error($handle);
        exit;
    }
    curl_close($handle);

    // Parse the page and pick the match links.
    $document = new DOMDocument();
    $document->loadHtml($markup);
    $finder = new DOMXPath($document);

    $titles = array();
    foreach ($finder->query("//div[@id='content']/a") as $anchor) {
        $titles[] = $anchor->textContent;
    }
    return $titles;
}
Example #18
0
 /**
  * Gets content panel for the Debugbar
  *
  * Reports the total number of tags and the number of link, script and img
  * tags in the response body, the body size in K, and a form that posts the
  * body to the W3C validator.
  *
  * @return string
  */
 public function getPanel()
 {
     $responseBody = Zend_Controller_Front::getInstance()->getResponse()->getBody();

     // Parse quietly, then put the previous libxml error mode back.
     $previousMode = libxml_use_internal_errors(true);
     $document = new DOMDocument();
     $document->loadHtml($responseBody);
     libxml_use_internal_errors($previousMode);

     $output = '<h4>HTML Information</h4>';
     $output .= $this->_isXhtml();
     $br = $this->getLinebreak();

     $sizeInK = round(strlen($responseBody) / 1024, 2);
     $output .= $document->getElementsByTagName('*')->length . ' Tags in ' . $sizeInK . 'K' . $br;
     $output .= $document->getElementsByTagName('link')->length . ' Link Tags' . $br;
     $output .= $document->getElementsByTagName('script')->length . ' Script Tags' . $br;
     $output .= $document->getElementsByTagName('img')->length . ' Images' . $br;

     // Validator form carrying the escaped body as a hidden field.
     $output .= '<form method="post" action="http://validator.w3.org/check"><p><input type="hidden" name="fragment" value="' . htmlentities($responseBody) . '"' . $this->getClosingBracket();
     $output .= '<input type="submit" value="Validate With W3C"' . $this->getClosingBracket() . '</p></form>';

     return $output;
 }
 /**
  * A title containing bold markup must be rendered unchanged after the
  * wp-rules.json rules are applied to a Header.
  */
 public function testTitleTransformedWithBold()
 {
     $transformer = new Transformer();
     $transformer->loadRules(file_get_contents(__DIR__ . '/wp-rules.json'));

     // The XML prolog declares the encoding for the HTML parser.
     $markup = '<?xml encoding="utf-8" ?><h1>Title <b>in bold</b></h1>';
     libxml_use_internal_errors(true);
     $dom = new \DOMDocument();
     $dom->loadHtml($markup);
     libxml_use_internal_errors(false);

     $target = Header::create();
     $transformer->transform($target, $dom);

     $rendered = $target->getTitle()->render();
     $this->assertEquals('<h1>Title <b>in bold</b></h1>', $rendered);
 }
Example #20
0
 /**
  * Get an Excerpt array from a chunk of HTML
  *
  * When several elements match, the excerpt describes the LAST one in
  * document order.
  *
  * @param string $html  Chunk of HTML
  * @param string $tag   a tag, for example `img`
  * @return array|null   ['element' => ['name' => ..., 'attributes' => [...]]],
  *                      or null when $html is empty or holds no such element
  */
 public static function getExcerptFromHtml($html, $tag)
 {
     // loadHTML() rejects an empty string: a warning on older PHP, a
     // ValueError on PHP 8. There is nothing to find in it either way.
     if ($html === null || $html === '') {
         return null;
     }
     $doc = new \DOMDocument();
     $doc->loadHtml($html);
     $excerpt = null;
     foreach ($doc->getElementsByTagName($tag) as $element) {
         $attributes = [];
         foreach ($element->attributes as $name => $attribute) {
             $attributes[$name] = $attribute->value;
         }
         $excerpt = ['element' => ['name' => $element->tagName, 'attributes' => $attributes]];
     }
     return $excerpt;
 }
Example #21
0
 /**
  * Truncate a lead paragraph (summary).
  *
  * The string is parsed as the body of a UTF-8 HTML document, the nodes that
  * self::_truncateSummaryDom() selects for the given width are removed, and
  * the remaining body markup is returned.
  *
  * @param string $string The HTML string
  * @param int    $width  Width to truncate to
  * @param string $dot    Suffix appended when the content was truncated
  *
  * @return string
  */
 public static function truncateSummary($string, $width, $dot = '…')
 {
     $dom = new \DOMDocument();
     $dom->loadHtml('<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>' . $string . '</body></html>');
     $body = $dom->getElementsByTagName('body');
     $will_remove_nodes = self::_truncateSummaryDom($body->item(0)->childNodes, $width);
     foreach ($will_remove_nodes as $node) {
         $node->parentNode->removeChild($node);
     }
     // Build the result from the serialised body, falling back to the input
     // when no body can be found in the serialised document.
     $dom_result = $dom->saveHTML();
     preg_match('#<body>(.*)</body>#is', $dom_result, $result);
     $result = isset($result[1]) ? trim($result[1]) : $string;
     if ($will_remove_nodes) {
         // Honour the caller's suffix; '...' used to be hard-coded here.
         $result .= $dot;
     }
     return $result;
 }
Example #22
0
 /**
  * Build a menu with one link and a folder of two links, then echo the
  * rendered markup pretty-printed through DOMDocument.
  */
 public function testMenuItemCollection()
 {
     $menu = new MenuItemCollection();
     ok($menu->appendLink('Car', ['href' => '/products/car']));

     $others = $menu->appendFolder('Others');
     ok($others);
     foreach (['A' => '/products/a', 'B' => '/products/b'] as $label => $href) {
         $others->appendLink($label, ['href' => $href]);
     }

     $rendered = $menu->render();

     $document = new DOMDocument('1.0');
     $document->preserveWhiteSpace = false;
     $document->formatOutput = true;
     $document->loadHtml($rendered);
     echo $document->saveHTML();
 }
Example #23
0
 /**
  * parse
  *
  * Extract the called selenium functions from the html suite: every <td> of
  * every <tr> in body > table > tbody is stored in $this->parsed_table,
  * indexed by row and then by cell.
  */
 public function parse()
 {
     $document = new DOMDocument();
     @$document->loadHtml($this->html);
     $root = simplexml_import_dom($document);

     $table = array();
     $rowIndex = 0;
     foreach ($root->body->table->tbody->tr as $row) {
         foreach ($row->td as $cell) {
             // Appending numbers the cells 0..n-1 within the row.
             $table[$rowIndex][] = $cell;
         }
         // Rows without cells still consume an index.
         $rowIndex++;
     }
     $this->parsed_table = $table;
 }
Example #24
0
 /**
  * Create a DOMDocument from an HTML string.
  *
  * Sets $this->error to false on success, to ERROR_TYPE_ENCODING when the
  * (optionally formatted) HTML is empty, or to ERROR_TYPE_DOM_PARSING when
  * the parser rejects it.
  *
  * @param string      $html         HTML source
  * @param string|null $charset      Passed to formatHtml()
  * @param string|null $charset_hint Passed to formatHtml()
  * @param bool        $format       Whether to run formatHtml() first
  * @return DOMDocument|false The parsed document, or false on failure
  */
 public function createFromHtml($html, $charset = null, $charset_hint = null, $format = true)
 {
     if ($format) {
         $html = $this->formatHtml($html, $charset, $charset_hint);
     }

     if (!$html) {
         $this->error = self::ERROR_TYPE_ENCODING;
         return false;
     }

     $document = new DOMDocument("1.0", "utf-8");
     $parsed = @$document->loadHtml($html);
     if (!$parsed) {
         $this->error = self::ERROR_TYPE_DOM_PARSING;
         return false;
     }

     $this->error = false;
     return $document;
 }
Example #25
0
/**
 * Force every link in an HTML fragment to open in a new window.
 *
 * @param string $content UTF-8 HTML fragment
 * @return string The fragment with target="_blank" set on every <a>
 */
function fof_item_targets($content)
{
    /* collect parser messages instead of emitting warnings */
    $previous_mode = libxml_use_internal_errors(true);

    /*
    	The fragment is parsed inside a wrapper div so that bare text is
    	not wrapped in p elements by the parser.  Only the children of
    	that div are emitted again at the end.
    */
    $wrapped = '<div>' . mb_convert_encoding($content, 'HTML-ENTITIES', "UTF-8") . '</div>';
    $document = new DOMDocument();
    $document->loadHtml($wrapped);

    /* drop the <!DOCTYPE> which DOMDocument adds */
    $document->removeChild($document->firstChild);
    /* hoist the wrapper div (html > body > div) into the place of <html> */
    $wrapper = $document->firstChild->firstChild->firstChild;
    $document->replaceChild($wrapper, $document->firstChild);

    /* replace or add link targets */
    $finder = new DOMXpath($document);
    foreach ($finder->query('//a') as $anchor) {
        $anchor->setAttribute('target', '_blank');
    }

    /* serialise each child of the wrapper div in turn */
    $result = '';
    for ($child = $document->firstChild->firstChild; $child; $child = $child->nextSibling) {
        $result .= $document->saveHTML($child);
    }

    /* log everything that is not a mere warning */
    foreach (libxml_get_errors() as $problem) {
        if ($problem->level !== LIBXML_ERR_WARNING) {
            fof_log(__FUNCTION__ . ': ' . $problem->message);
        }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous_mode);

    return $result;
}
 /**
  * Extract attraction entries from an HTML page and return them as JSON.
  *
  * Within every element whose class is exactly
  * 'wrap al_border attraction_element' the method collects:
  *  - titles:  <a> whose parent has class "property_title"
  *  - ratings: first word of the alt text of <img> whose grandparent has
  *             class "rs rating"
  *  - reviews: <span> with class "more"
  *  - tags:    <span> whose parent has class "p13n_reasoning_v2"
  * and emits one entry per title/rating/review/tag combination.
  *
  * @param string $_Data Path or URL readable by file_get_contents()
  * @return string JSON list of {Title, Rating, Review, Tag} objects; "[]"
  *                when the page cannot be read or nothing matches
  */
 public function getData($_Data)
 {
     // Initialised up front so an empty result encodes as [] rather than null.
     $links = array();
     $html = file_get_contents($_Data);
     if ($html === false || $html === '') {
         return json_encode($links);
     }
     $xml = new DOMDocument();
     $xml->loadHtml($html);
     $xpath = new DOMXPath($xml);
     $results = $xpath->query("//*[@class='wrap al_border attraction_element']");
     foreach ($results as $container) {
         // Filter each candidate list once instead of re-testing every
         // condition for every combination in the innermost loop.
         $titles = array();
         foreach ($container->getElementsByTagName("a") as $anchor) {
             if ($anchor->parentNode->getAttribute('class') === "property_title") {
                 $titles[] = trim(preg_replace("/[\r\n]+/", " ", $anchor->nodeValue));
             }
         }
         $ratings = array();
         foreach ($container->getElementsByTagName("img") as $image) {
             $holder = $image->parentNode->parentNode;
             if ($holder instanceof DOMElement && $holder->getAttribute('class') === "rs rating") {
                 $words = explode(" ", trim(preg_replace("/[\r\n]+/", " ", $image->getAttribute("alt"))));
                 $ratings[] = $words[0];
             }
         }
         $reviews = array();
         $tags = array();
         foreach ($container->getElementsByTagName("span") as $span) {
             $text = trim(preg_replace("/[\r\n]+/", " ", $span->nodeValue));
             if ($span->getAttribute('class') === "more") {
                 $reviews[] = $text;
             }
             if ($span->parentNode->getAttribute('class') === "p13n_reasoning_v2") {
                 $tags[] = $text;
             }
         }
         // Same combination order as before: titles outermost, tags innermost.
         foreach ($titles as $title) {
             foreach ($ratings as $rating) {
                 foreach ($reviews as $review) {
                     foreach ($tags as $tag) {
                         $links[] = array('Title' => $title, 'Rating' => $rating, 'Review' => $review, 'Tag' => $tag);
                     }
                 }
             }
         }
     }
     return json_encode($links);
 }
 /**
  * Scrape the product list fetched through $this->ch and each product's
  * detail page.
  *
  * For every div with class 'product ' the first link is followed. The entry
  * records the link text, the byte size of the linked HTML, the unit price
  * and the text of the first div on the detail page whose class contains
  * 'Text'. Products without a link are skipped.
  *
  * @return string Pretty-printed JSON with 'results' (list of title, size,
  *                unit_price, description) and 'total' (sum of unit prices)
  */
 public function getGroceryData()
 {
     $html = curl_exec($this->ch);
     $this->chackNotEmpty($html, $this->ch);
     $dom = new DOMDocument();
     @$dom->loadHtml($html);
     $xpath = new DOMXPath($dom);
     $total = 0;
     $results = array();
     foreach ($xpath->query("//div[@class='product ']") as $product) {
         // Get Product anchor
         $anchor = $xpath->query("div//a", $product)->item(0);
         if (!$anchor) {
             continue;
         }
         $text = trim(preg_replace("/[\r\n]+/", " ", $anchor->nodeValue));
         $link = trim(preg_replace("/[\r\n]+/", " ", $anchor->getAttribute("href")));
         curl_setopt($this->ch, CURLOPT_URL, $link);
         $linked_html = curl_exec($this->ch);
         $this->chackNotEmpty($linked_html, $this->ch);
         // Get the size in kb of the linked HTML without assets
         $sizeBites = strlen($linked_html);
         $size = $sizeBites > 1024 ? round($sizeBites / 1024, 2) . "kb" : $sizeBites . "b";
         // Load linked HTML
         $dom_linked_html = new DOMDocument();
         @$dom_linked_html->loadHtml($linked_html);
         $xpath_linked_html = new DOMXPath($dom_linked_html);
         // Get Product Description
         $descriptionNode = $xpath_linked_html->query("//div[contains(@class, 'Text')]")->item(0);
         $description = $descriptionNode ? trim(preg_replace("/[\r\n]+/", " ", $descriptionNode->nodeValue)) : '';
         // Get price per unit. Take the first decimal number in the text
         // instead of a fixed byte slice, which cut off prices of 10.00 and more.
         $priceNode = $xpath->query('div//p[@class="pricePerUnit"]', $product)->item(0);
         $priceText = $priceNode ? trim(preg_replace("/[\r\n]+/", " ", $priceNode->nodeValue)) : '';
         $price = preg_match('/\d+(?:\.\d+)?/', $priceText, $matches) ? $matches[0] : '0';
         $total += (float) $price;
         // Get the final array
         $results[] = array('title' => $text, 'size' => $size, 'unit_price' => $price, 'description' => $description);
     }
     // Generate JSON
     return json_encode(array('results' => $results, 'total' => $total), JSON_PRETTY_PRINT);
 }
Example #28
0
 /**
  * Extract dictionary results from a result page.
  *
  * Each div with class "concept_light" may contribute an entry (keyed by its
  * position among those divs) with:
  *  - 'ja':   text of the last matching representation span
  *  - 'type': $this->getType() of the first meaning-tags div of the last
  *            meanings wrapper that has one
  *  - 'en':   list of all meaning texts
  *  - 'url':  href of the last "light-details_link" anchor
  * Keys are only present when the corresponding markup was found.
  *
  * libxml internal error handling is switched on and left on, as before.
  *
  * @param string $input HTML of the result page
  * @return array Entries as described above; empty when nothing matched
  */
 public function scrape($input)
 {
     // Initialised so that a page without results returns [] instead of an
     // undefined variable.
     $alfred_results = array();
     // Suppress warnings relating to XML markup
     libxml_use_internal_errors(true);
     libxml_clear_errors();
     $doc = new DOMDocument();
     $doc->loadHtml($input);
     $xpath = new DOMXPath($doc);
     $nodes = $xpath->query("//div[contains(concat(' ',@class,' '),' concept_light ')]");
     foreach ($nodes as $i => $result) {
         // Get Japanese Word
         $readings = $xpath->query("div/div/div[contains(concat(' ',@class,' '),' concept_light-representation ')]/span[contains(concat(' ',@class,' '),' text ')]", $result);
         foreach ($readings as $word) {
             $alfred_results[$i]['ja'] = trim($word->nodeValue);
         }
         // Get result details
         $definitions = $xpath->query("div[contains(concat(' ',@class,' '),' concept_light-meanings ')]/div[contains(concat(' ',@class,' '),' meanings-wrapper ')]", $result);
         foreach ($definitions as $definition) {
             // Get type (verb, noun, etc) from the first tag block only
             $firstType = $xpath->query("div[contains(concat(' ',@class,' '),' meaning-tags ')]", $definition)->item(0);
             if ($firstType) {
                 $alfred_results[$i]['type'] = $this->getType(trim(strtolower($firstType->nodeValue)));
             }
             // Get definitions
             $words = $xpath->query("div[contains(concat(' ',@class,' '),' meaning-wrapper ')]/div[contains(concat(' ',@class,' '),' meaning-definition ')]/span[contains(concat(' ',@class,' '),' meaning-meaning ')]", $definition);
             foreach ($words as $word) {
                 $alfred_results[$i]['en'][] = trim($word->nodeValue);
             }
         }
         // Get Link
         $anchors = $xpath->query("a[contains(concat(' ',@class,' '),' light-details_link ')]", $result);
         foreach ($anchors as $anchor) {
             $alfred_results[$i]['url'] = $anchor->getAttribute("href");
         }
     }
     return $alfred_results;
 }
Example #29
0
 /**
  * Parse translation pairs out of a result table.
  *
  * The td.text cells are consumed two at a time. Within a pair, one cell
  * supplies the original word and the other the language code and translated
  * word; every completed pair is appended to $resultArray as a ParserResult.
  *
  * @param string $input       HTML containing the result table
  * @param array  $resultArray Receives the ParserResult objects (by reference)
  */
 function parseEntries($input, array &$resultArray)
 {
     $document = new DOMDocument();
     $document->loadHtml($input);
     $finder = new DOMXPath($document);
     $cells = $finder->query("//table/*/tr/td[@class='text']");

     $entry = new ParserResult();
     $position = 0;
     foreach ($cells as $cell) {
         $position++;

         // Call removeSmallTags() until it reports nothing left, then once
         // more, exactly as the original sequence did.
         do {
             $remaining = $this->removeSmallTags($cell);
         } while ($remaining->length != 0);
         $this->removeSmallTags($cell);

         $isSearchWord = $this->isSearchWord($cell);
         $languageCode = $this->getLanguageCode($cell);
         $text = utf8_decode(trim($cell->nodeValue));

         $closesPair = $position % 2 == 0;
         // First cell of a pair: a translation unless it is the search word.
         // Second cell: a translation only if the first one was not.
         $isTranslation = $closesPair ? $entry->languageCode == "" : !$isSearchWord;

         if ($isTranslation) {
             $entry->languageCode = $languageCode;
             $entry->translatedWord = $text;
         } else {
             $entry->originalWord = $text;
         }

         if ($closesPair) {
             array_push($resultArray, $entry);
             $entry = new ParserResult();
         }
     }
 }
/**
 * Scrape the sessions listed for one month.
 *
 * Reads the rows of the first <tbody>. The day comes from the 4th cell; rows
 * with an empty day inherit the last day seen above them. A row becomes a
 * session only when get_links_from_td() finds links in its 9th cell.
 *
 * @param mixed $year  Passed to get_ris_html() and used in the date string
 * @param mixed $month Passed to get_ris_html() and used in the date string
 * @return array|false List of ['datum', 'Sitzung', 'links'] arrays (possibly
 *                     empty), or false when get_ris_html() failed
 */
function get_ris_sessions($year, $month)
{
    $html = get_ris_html($year, $month);
    if ($html === false) {
        echo 'connection error';
        return false;
    }
    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();
    $dom = new DOMDocument();
    $dom->loadHtml($html);
    $sessions = array();
    $lastdate = 0;
    // A page without a <tbody> yields no sessions instead of a fatal error.
    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody ? $tbody->getElementsByTagName('tr') : array();
    foreach ($trs as $tr) {
        $tds = $tr->getElementsByTagName('td');
        // Determine the date. In rows with an empty day cell, the date shown
        // further up in this column still applies.
        if ($tds->length > 3) {
            $day = intval(trim($tds->item(3)->textContent));
            if ($day > 0) {
                $lastdate = sprintf("%'.02d", $day);
            }
        }
        // Rows that lack the document-links cell cannot hold a session.
        if ($tds->length < 9) {
            continue;
        }
        // Determine the document links.
        $links = get_links_from_td($tds->item(8));
        if (count($links) > 0) {
            $sessions[] = array('datum' => $lastdate . '.' . $month . '.' . $year, 'Sitzung' => $tds->item(5)->nodeValue, 'links' => $links);
        }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);
    return $sessions;
}