/**
 * Evaluates an XPath expression against the stored response body.
 *
 * Uses the DOM extension (DOMDocument/DOMXPath) when it is available and
 * falls back to the legacy domxml functions otherwise.
 *
 * @param string $xpath_expression XPath expression to evaluate.
 * @return DOMNodeList|stdClass|false The query result; on the domxml path an
 *                                    object exposing only ->length; false
 *                                    after a failure was reported via fail().
 */
function _testXPath($xpath_expression)
{
    if (!class_exists('DOMDocument') || !class_exists('DOMXPath')) {
        if (!function_exists('domxml_open_mem')) {
            $this->fail('No xpath support built in');
            return false;
        }

        $dom = domxml_open_mem($this->_response);
        if (!$dom) {
            $this->fail('Error parsing doc');
            return false;
        }

        // The debugging var_dump() calls that used to live here polluted the
        // test output and have been removed.
        $dom->xpath_init();
        $ctx = $dom->xpath_new_context();
        $result = $ctx->xpath_eval($xpath_expression);

        // Mimic the only part of DOMNodeList this path exposes: ->length.
        $return = new stdClass();
        $return->length = count($result->nodeset);
        return $return;
    }

    if (extension_loaded('domxml')) {
        $this->fail('Please disable the domxml extension. Only php5 builtin domxml is supported');
        return false;
    }

    $dom = new DOMDocument();
    $dom->loadHtml($this->_response);
    $xpath = new DOMXPath($dom);

    return $xpath->query($xpath_expression);
}
/**
 * Builds the response and parses its body into $this->_domDocument.
 *
 * All four arguments are forwarded unchanged to the parent constructor.
 *
 * @param mixed  $url
 * @param mixed  $status
 * @param array  $headers
 * @param string $body    Markup to parse; an empty body leaves the DOM empty.
 */
protected function __construct($url, $status, array $headers, $body)
{
    parent::__construct($url, $status, $headers, $body);

    $this->_domDocument = new DOMDocument();
    $this->_domDocument->preserveWhiteSpace = true;

    // DOMDocument::loadHTML() rejects an empty source (a ValueError on PHP 8,
    // which error suppression cannot silence), so skip parsing in that case.
    if ($body === null || $body === '') {
        return;
    }

    // Invalid documents tend to raise a lot of parser warnings. Collect them
    // internally instead of emitting them, then restore the caller's setting.
    $previous = libxml_use_internal_errors(true);
    $this->_domDocument->loadHtml($body);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
}
/**
 * Loads an HTML string into the wrapped document.
 *
 * Parser errors are collected internally and discarded. The libxml error
 * mode (and, before PHP 8, the entity-loader state) is restored to whatever
 * it was before the call instead of being forced to false.
 *
 * @param string $html
 * @return $this
 * @throws \InvalidArgumentException if $html is not a string
 */
public function loadHtml($html)
{
    if (!is_string($html)) {
        throw new InvalidArgumentException(sprintf('%s expects parameter 1 to be string, %s given', __METHOD__, is_object($html) ? get_class($html) : gettype($html)));
    }

    $previousErrorMode = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
    // only toggled on older runtimes.
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    $previousLoaderState = false;
    if ($toggleEntityLoader) {
        $previousLoaderState = libxml_disable_entity_loader(true);
    }

    $this->document->loadHtml($html);

    libxml_clear_errors();
    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($previousLoaderState);
    }
    libxml_use_internal_errors($previousErrorMode);

    return $this;
}
/**
 * Scrapes the member table found in the first <tbody> of the page at $url.
 *
 * Each row contributes 'name' and 'link' (from the first anchor in the first
 * cell), 'partei' (second cell with $parteinameFilter removed) and, when the
 * third cell is non-empty after filtering, 'amtszeitende'.
 * Rows without a first cell or without an anchor are skipped.
 *
 * @param string $url Page to fetch via get_html().
 * @return array|false List of member arrays; false when the page could not
 *                     be fetched (after echoing 'connection error').
 */
function get_members($url)
{
    global $parteinameFilter;

    $html = get_html($url);
    if ($html === false) {
        echo 'connection error';
        return false;
    }

    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();

    $dom = new DOMDocument();
    // loadHTML() refuses an empty source, so only parse real content.
    if (is_string($html) && $html !== '') {
        $dom->loadHtml($html);
    }

    $members = array();
    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody !== null ? $tbody->getElementsByTagName('tr') : array();

    foreach ($trs as $tr) {
        $tds = $tr->getElementsByTagName('td');
        $firstCell = $tds->item(0);
        if ($firstCell === null) {
            continue; // e.g. a header row made of <th> cells
        }

        $link = $firstCell->getElementsByTagName('a')->item(0);
        if ($link === null) {
            continue;
        }

        $parteiCell = $tds->item(1);
        $member = array(
            'name' => $link->nodeValue,
            'link' => $link->getAttribute('href'),
            'partei' => str_replace($parteinameFilter, '', $parteiCell !== null ? $parteiCell->nodeValue : ''),
        );

        // NOTE(review): the search string is reproduced as found; it may
        // originally have been '&nbsp;' or a non-breaking space - confirm.
        $azeCell = $tds->item(2);
        $aze = str_replace(' ', '', htmlentities($azeCell !== null ? $azeCell->nodeValue : ''));
        if ($aze) {
            $member['amtszeitende'] = $aze;
        }

        $members[] = $member;
    }

    // Always reached now, so the caller's libxml error mode is restored.
    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);

    return $members;
}
/**
 * Scrapes the committee table found in the first <tbody> of the page at $url.
 *
 * Only rows whose first cell contains an anchor produce an entry. Previously
 * a row without a link appended an undefined value or a duplicate of the
 * preceding committee.
 *
 * @param string $url Page to fetch via get_html().
 * @return array|false List of array('name' => ..., 'link' => ...); false when
 *                     the page could not be fetched (after echoing
 *                     'connection error').
 */
function get_commitees($url)
{
    $html = get_html($url);
    if ($html === false) {
        echo 'connection error';
        return false;
    }

    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();

    $dom = new DOMDocument();
    // loadHTML() refuses an empty source, so only parse real content.
    if (is_string($html) && $html !== '') {
        $dom->loadHtml($html);
    }

    $commitees = array();
    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody !== null ? $tbody->getElementsByTagName('tr') : array();

    foreach ($trs as $tr) {
        $firstCell = $tr->getElementsByTagName('td')->item(0);
        if ($firstCell === null) {
            continue;
        }

        $link = $firstCell->getElementsByTagName('a')->item(0);
        if ($link === null) {
            continue;
        }

        $commitees[] = array(
            'name' => $link->nodeValue,
            'link' => $link->getAttribute('href'),
        );
    }

    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);

    return $commitees;
}
/**
 * Currently disabled: dumps a "needs fixed" notice and terminates the script.
 *
 * Everything after the exit statement is unreachable. It is kept as the
 * sketch of the intended behaviour: set attribute $name to $value on nested
 * elements of type $elementType, including elements inside raw HTML strings.
 *
 * NOTE(review): the unreachable part reads $this inside a static method and
 * uses $fixRawHtml, which is not defined in this function.
 *
 * @param string $elementType Class name below the \AbstractElement namespace.
 * @param string $name        Attribute name.
 * @param string $value       Attribute value.
 */
static function fixChildrenAttribute($elementType, $name, $value)
{
    var_dump('AbstractElement\\Helper::fixChildrenAttribute needs fixed');
    exit;

    // ---- unreachable from here on ----
    $classPath = '\\AbstractElement\\' . $elementType;

    foreach ($this->contents as $position => $item) {
        if (is_a($item, '\\Element')) {
            // Element objects: update matching ones, then recurse.
            if (is_a($item, $classPath)) {
                $item->setAttribute($name, $value);
            }
            $item->fixChildrenAttribute($elementType, $name, $value);
            continue;
        }

        if (!(is_string($item) && $fixRawHtml)) {
            continue;
        }

        // Raw HTML strings: patch every tag of the element type's 'tag'.
        $fragment = new \DOMDocument();
        $fragment->loadHtml($item);
        $reflection = new \ReflectionClass($classPath);
        $tagName = $reflection->getConstant('tag');
        foreach ($fragment->getElementsByTagName($tagName) as $node) {
            $node->setAttribute($name, $value);
        }
        $this->contents[$position] = $fragment->saveHTML();
    }
}
/**
 * Stores inline (data-URL) images of a submitted HTML text as files and
 * rewrites their src attributes to the stored file's URL.
 *
 * Files are written to uploads/<media type description>/<random>.<ext> below
 * the public path. Only images whose data URL declares one of the whitelisted
 * raster formats are extracted; any other <img> is left untouched. The
 * extension used to be taken verbatim from the submitted data URL, which let
 * arbitrary text end up in the file path.
 *
 * @param string $texto      Submitted HTML.
 * @param mixed  $tipo_midia Identifier passed to TipoMidia::findOrFail().
 * @return string The HTML with rewritten image sources.
 */
public static function uploadTextarea($texto, $tipo_midia)
{
    $nomeTipo = TipoMidia::findOrFail($tipo_midia)->descricao;

    // Save the images embedded in the body of the text.
    $dom = new \DOMDocument();
    $dom->loadHtml($texto, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

    foreach ($dom->getElementsByTagName('img') as $img) {
        $src = $img->getAttribute('src');

        // One anchored match both detects the data URL and validates the
        // declared format, so $groups['mime'] is always set and always safe.
        if (!preg_match('/^\s*data:image\/(?<mime>png|jpe?g|gif|webp|bmp);/i', $src, $groups)) {
            continue;
        }
        $mimetype = strtolower($groups['mime']);

        // Random file name.
        $filename = md5(uniqid());
        $filepath = "uploads/" . $nomeTipo . "/" . $filename . '.' . $mimetype;

        // @see http://image.intervention.io/api/
        Image::make($src)->encode($mimetype, 100)->save(public_path($filepath));

        $img->setAttribute('src', asset($filepath));
    }

    return $dom->saveHTML();
}
/**
 * Removes every <link> element from an HTML string.
 *
 * The content is parsed as a full document and every top-level node of that
 * document is serialised back. Parser errors above warning level are passed
 * to fof_log().
 *
 * @param string $content UTF-8 HTML.
 * @return string The re-serialised markup without <link> elements; an empty
 *                string for empty input.
 */
function remove_link_tags($content)
{
    // loadHTML() rejects an empty source; there is nothing to strip anyway.
    if ((string) $content === '') {
        return '';
    }

    $old_xml_err = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $dom->loadHtml(mb_convert_encoding($content, 'HTML-ENTITIES', "UTF-8"));

    // getElementsByTagName() returns a live list: removing nodes while
    // iterating forwards skips every other match. Walking backwards keeps
    // the remaining indexes stable.
    $links = $dom->getElementsByTagName('link');
    for ($i = $links->length - 1; $i >= 0; $i--) {
        $link = $links->item($i);
        $link->parentNode->removeChild($link);
    }

    $content_out = '';
    for ($node = $dom->firstChild; $node; $node = $node->nextSibling) {
        $content_out .= $dom->saveHTML($node);
    }

    foreach (libxml_get_errors() as $error) {
        /* just ignore warnings */
        if ($error->level === LIBXML_ERR_WARNING) {
            continue;
        }
        fof_log(__FUNCTION__ . ': ' . $error->message);
    }
    libxml_clear_errors();
    libxml_use_internal_errors($old_xml_err);

    return $content_out;
}
/**
 * Fetches the message conversation with a given sender.
 *
 * @param string $gamertag Gamertag of the account owner (trimmed before use).
 * @param string $region   Region path segment of the account URL.
 * @param string $sender   Gamertag of the conversation partner.
 * @return array|false List of array('message' => ..., 'time' => ...,
 *                     'sender' => ...); false when $gamertag or $sender is
 *                     empty.
 */
public function fetchConversationWith($gamertag, $region, $sender)
{
    $gamertag = trim($gamertag);

    // Validate before doing any network I/O.
    if (empty($sender) || empty($gamertag)) {
        return false;
    }

    // Decode-then-encode keeps an already encoded value intact while making
    // raw values (e.g. gamertags containing spaces) URL-safe.
    $url = 'https://account.xbox.com/' . $region . '/Messages/UserConversation?senderGamerTag=' . rawurlencode(rawurldecode($sender));
    $data = $this->fetch_url($url);

    $messages = array();
    if (!is_string($data) || $data === '') {
        return $messages;
    }

    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHtml($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($doc);
    $last_sender = "";

    foreach ($xpath->query("//div[@class='messageContent']") as $node) {
        // Serialise the node once and reuse the markup for all three lookups.
        $markup = $node->ownerDocument->saveHTML($node);

        $body = $this->find($markup, '<div class="messageBody">', '</div>');
        $time = $this->find($markup, '<div class="sentDate localTime">', '</div>');
        $from = $this->find($markup, '<div class="senderGamertag">', '</div>');

        // An entry without its own sender inherits the previous one.
        if ($from) {
            $last_sender = $from;
        }

        $messages[] = array(
            'message' => $body,
            'time' => $time,
            'sender' => $from ? $from : $last_sender,
        );
    }

    return $messages;
}
/**
 * Load HTML or XML.
 *
 * @param string $string  HTML or XML string or file path
 * @param bool   $isFile  Indicates that the first parameter is a file path
 * @param string $type    Type of document, "html" or "xml" (case-insensitive)
 * @param int    $options Additional libxml parameters
 * @return $this
 * @throws InvalidArgumentException if $string or $type is not a string, if
 *                                  $type is neither "xml" nor "html", or if
 *                                  $options is not an integer
 */
public function load($string, $isFile = false, $type = 'html', $options = 0)
{
    if (!is_string($string)) {
        throw new InvalidArgumentException(sprintf('%s expects parameter 1 to be string, %s given', __METHOD__, is_object($string) ? get_class($string) : gettype($string)));
    }

    // Check the type of $type before lower-casing it.
    if (!is_string($type)) {
        throw new InvalidArgumentException(sprintf('%s expects parameter 3 to be string, %s given', __METHOD__, is_object($type) ? get_class($type) : gettype($type)));
    }

    $type = strtolower($type);
    if (!in_array($type, ['xml', 'html'], true)) {
        // The single placeholder now receives the offending value; it used
        // to receive __METHOD__.
        throw new InvalidArgumentException(sprintf('Document type must be "xml" or "html", %s given', $type));
    }

    if (!is_integer($options)) {
        throw new InvalidArgumentException(sprintf('%s expects parameter 4 to be integer, %s given', __METHOD__, is_object($options) ? get_class($options) : gettype($options)));
    }

    $string = trim($string);

    if ($isFile) {
        $string = $this->loadFile($string);
    }

    // Prepend an XML prolog carrying the document encoding unless the input
    // already starts with one.
    if (substr($string, 0, 5) !== '<?xml') {
        $prolog = sprintf('<?xml version="1.0" encoding="%s"?>', $this->document->encoding);
        $string = $prolog . $string;
    }

    $this->type = $type;

    Errors::disable();

    if ($this->type === 'xml') {
        $this->document->loadXml($string, $options);
    } else {
        $this->document->loadHtml($string, $options);
    }

    Errors::restore();

    return $this;
}
/**
 * Normalises an HTML fragment.
 *
 * Steps, in order: run self::xss(); wrap plain text into paragraphs; collapse
 * whitespace; turn bold/italic font/span styling into <strong>/<i>; drop all
 * attributes except those listed in $valid; map div to p, b to strong and br
 * to a paragraph break; round-trip through DOMDocument to repair the markup;
 * strip the document wrapper and empty elements.
 *
 * The regular expressions are order-dependent and left exactly as they were.
 * The libxml error mode is now restored to the caller's value, and collected
 * parser errors are cleared instead of piling up.
 *
 * @param string $html
 * @return string The cleaned fragment; empty input is returned unchanged.
 */
public static function fix($html)
{
    if (empty($html)) {
        return $html;
    }

    $html = self::xss($html);

    if (substr($html, 0, 2) !== '<p') {
        $html = '<p>' . implode('</p><p>', preg_split('/[\\n\\r]/', $html)) . '</p>';
    }

    $html = trim(str_replace(["\n", "\r"], ' ', self::xss($html)));
    $html = preg_replace('#\\s{2,}#', ' ', $html);

    $valid = 'class|src|target|alt|title|href|rel';

    $html = preg_replace('#<(font|span) style="font-weight[^"]+">([^<]+)</(font|span)>#i', '<strong>$2</strong>', $html);
    $html = preg_replace('#<(font|span) style="font-style:\\s*italic[^"]+">([^<]+)</(font|span)>#i', '<i>$2</i>', $html);

    // Protect whitelisted attributes with |name| markers, drop every other
    // attribute, then turn the markers back into attributes.
    $html = preg_replace('# (' . $valid . ')=#i', ' |$1|', $html);
    $html = preg_replace('# [a-z]+=["\'][^"\']*["\']#i', '', $html);
    $html = preg_replace('#\\|(' . $valid . ')\\|#i', ' $1=', $html);

    $html = preg_replace('#</?(font|span)[^>]*>#', '', $html);
    $html = preg_replace('#<(/?)div#', '<$1p', $html);
    $html = preg_replace('#<(/?)b>#', '<$1strong>', $html);
    $html = preg_replace('#<br\\s*/?>#', '</p><p>', $html);

    $previousErrorMode = libxml_use_internal_errors(true);

    $DOM = new \DOMDocument();
    $DOM->recover = true;
    $DOM->preserveWhiteSpace = false;
    $DOM->substituteEntities = false;
    $DOM->loadHtml(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'), LIBXML_NOBLANKS | LIBXML_ERR_NONE);
    $DOM->encoding = 'utf-8';

    $html = $DOM->saveHTML();

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $html = preg_replace('~<(?:!DOCTYPE|/?(?:\\?xml|html|head|body))[^>]*>\\s*~i', '', $html);
    $html = preg_replace('#<([^\\s]+)[^>]*>\\s*</\\1>#', '', $html);
    $html = preg_replace('#</p>\\s+<p#', '</p><p', $html);

    // NOTE(review): search and replacement look identical here; the search
    // string was presumably a non-breaking space or '&nbsp;' - confirm.
    return trim(str_replace(' ', ' ', $html));
}
/**
 * Returns detailed results for a Google search.
 *
 * Result pages are fetched one after another for as long as the response
 * contains the (empty) div with id "nn"; each extra page waits 200 ms longer
 * than the previous one.
 *
 * @param string $query Search query; it is inserted into the URL as given.
 * @return array List of arrays with the keys 'url', 'title' and 'descr'.
 * @throws SEOstatsException when the response refers to support answer 86640.
 */
public static function googleArray($query)
{
    $result = array();
    $pages = 1;
    $delay = 0;

    for ($start = 0; $start < $pages; $start++) {
        $url = 'http://www.google.' . GOOGLE_TLD . '/custom?q=' . $query . '&filter=0' . '&num=100' . ($start == 0 ? '' : '&start=' . $start . '00');
        $str = SEOstats::cURL($url);

        if (preg_match("#answer=86640#i", $str)) {
            $e = 'Please read: http://www.google.com/support/websearch/' . 'bin/answer.py?&answer=86640&hl=en';
            throw new SEOstatsException($e);
        }

        $html = new DOMDocument();
        // loadHTML() rejects an empty source, so only parse real content.
        if (is_string($str) && $str !== '') {
            @$html->loadHtml($str);
        }

        $xpath = new DOMXPath($html);
        $links = $xpath->query("//div[@class='g']//a");
        $descs = $xpath->query("//td[@class='j']//div[@class='std']");

        $i = 0;
        foreach ($links as $link) {
            // Skip the "cache" / "similar" helper links.
            if (preg_match('#cache#si', $link->textContent) || preg_match('#similar#si', $link->textContent)) {
                continue;
            }

            // There may be fewer description blocks than result links.
            $desc = $descs->item($i);
            $result[] = array(
                'url' => $link->getAttribute('href'),
                'title' => utf8_decode($link->textContent),
                'descr' => utf8_decode($desc !== null ? $desc->textContent : ''),
            );
            $i++;
        }

        if (preg_match('#<div id="nn"><\\/div>#i', $str) || preg_match('#<div id=nn><\\/div>#i', $str)) {
            // Another page is available: extend the loop and back off.
            $pages += 1;
            $delay += 200000;
            usleep($delay);
        } else {
            $pages -= 1;
        }
    }

    return $result;
}
/**
 * A proxied request with an output file must write the response to that file
 * (headers followed by the body) and leave $client->content empty.
 *
 * Expected and actual markup are both normalised through DOMDocument before
 * they are compared.
 */
function testRequestToOutputFile()
{
    $client = new ProxyClient();
    $client->URL = df_absolute_url('tests/test_ProxyClient/test1.html');
    $outputFile = tempnam(sys_get_temp_dir(), 'test_ProxyClient');
    $client->outputFile = $outputFile;
    $client->process();

    $this->assertEquals(null, $client->content, 'Content should be written to output file, not saved to variable.');

    $expected = file_get_contents('tests/test_ProxyClient/test1.html');
    $doc = new DOMDocument();
    @$doc->loadHtml($expected);
    $expected = $doc->saveHtml();
    unset($doc);

    $fh = fopen($outputFile, 'r');
    if ($fh === false) {
        $this->fail('Could not open the output file for reading.');
        return;
    }

    while (!feof($fh) and trim($line = fgets($fh, 1024))) {
        // Skip the header lines; the first blank line ends them.
    }
    $actual = stream_get_contents($fh);
    fclose($fh);

    // The temporary file is no longer needed; do not leave it behind.
    unlink($outputFile);

    $doc = new DOMDocument();
    @$doc->loadHtml($actual);
    $actual = $doc->saveHtml();
    unset($doc);

    $this->assertEquals($expected, $actual);
}
/**
 * Of three images (spacer.gif, sprite.png, cat.jpg) processing must keep
 * exactly one entry.
 */
public function it_should_remove_filtered_file_names()
{
    $markup = '<html><body><img src="img/spacer.gif" /><img src="img/sprite.png" /><img src="img/cat.jpg" /></body></html>';

    $dom = new \DOMDocument();
    $dom->loadHtml($markup);

    $this->beConstructedThrough('load', [
        UrlDocument::build($dom, 'http://simplegifts.co'),
        new StubFileSizeAnalyzer(),
    ]);

    $this->process()->shouldHaveCount(1);
}
/**
 * Reads the page metrics from the first four <td> cells of the
 * OPENSITEEXPLORER_URL response for a URL.
 *
 * @param string|false $url URL to look up; falls back to self::getUrl().
 * @return array Keys 'domainAuthority', 'pageAuthority',
 *               'linkingRootDomains' and 'totalInboundLinks'. A metric whose
 *               cell is missing from the response is an empty string.
 */
public function getPageMetrics($url = false)
{
    $url = false != $url ? $url : self::getUrl();
    $dataUrl = sprintf(services::OPENSITEEXPLORER_URL, 'links', '1', $url);
    $html = HttpRequest::sendRequest($dataUrl);

    $doc = new DOMDocument();
    // loadHTML() rejects an empty source, so only parse real content.
    if (is_string($html) && $html !== '') {
        $previous = libxml_use_internal_errors(true);
        $doc->loadHtml($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    $cells = $doc->getElementsByTagName('td');

    // The position of a key in this list is the index of its <td> cell.
    $keys = array('domainAuthority', 'pageAuthority', 'linkingRootDomains', 'totalInboundLinks');

    $metrics = array();
    foreach ($keys as $index => $key) {
        $cell = $cells->item($index);
        $metrics[$key] = $cell !== null ? trim(strip_tags($cell->textContent)) : '';
    }

    return $metrics;
}
/**
 * Translates the text nodes of an HTML fragment and returns the markup.
 *
 * The fragment is parsed into a document, handed to translateNodeText(), and
 * serialised again; the first 119 and the last 15 characters of the
 * serialised document are then cut off.
 * NOTE(review): those offsets presumably match the DOCTYPE line plus
 * "<html><body>" and the closing "</body></html>" that DOMDocument adds -
 * confirm against the installed libxml.
 *
 * @param string $htmlString
 * @return string
 */
protected function translateHTML($htmlString)
{
    // A freshly constructed object is always truthy, so the former guard
    // around this section never skipped it.
    $document = new DOMDocument();
    $document->loadHtml($htmlString);

    $this->translateNodeText($document);

    $serialised = $document->saveHTML();

    return mb_substr($serialised, 119, -15);
}
/**
 * Downloads the matches page and returns the text of every link that is a
 * direct child of the div with id "content".
 *
 * On a cURL error the message is echoed and the script terminates.
 *
 * @return array List of link texts; empty when the page body is empty.
 */
function getWebContent()
{
    $curl = curl_init("http://csgo.99damage.de/de/matches");
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    $page = curl_exec($curl);

    if (curl_errno($curl)) {
        echo 'Scraper error: ' . curl_error($curl);
        exit;
    }
    curl_close($curl);

    $data = array();

    // loadHTML() rejects an empty source; nothing to extract in that case.
    if (!is_string($page) || $page === '') {
        return $data;
    }

    // Real-world markup triggers many libxml warnings; collect and drop them
    // instead of printing them into the output.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHtml($page);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($dom);
    foreach ($xpath->query("//div[@id='content']/a") as $node) {
        $data[] = $node->textContent;
    }

    return $data;
}
/**
 * Gets content panel for the Debugbar
 *
 * Reports the number of tags, link tags, script tags and images in the
 * response body, plus a form that submits the body to the W3C validator.
 * An empty body yields zero counts instead of a parser error.
 *
 * @return string
 */
public function getPanel()
{
    $body = Zend_Controller_Front::getInstance()->getResponse()->getBody();

    $dom = new DOMDocument();
    // loadHTML() rejects an empty source (e.g. for redirect responses).
    if (is_string($body) && $body !== '') {
        $liberrors = libxml_use_internal_errors(true);
        $dom->loadHtml($body);
        // Drop the collected parser errors so they do not accumulate.
        libxml_clear_errors();
        libxml_use_internal_errors($liberrors);
    }

    $panel = '<h4>HTML Information</h4>';
    $panel .= $this->_isXhtml();
    $linebreak = $this->getLinebreak();

    $panel .= $dom->getElementsByTagName('*')->length . ' Tags in ' . round(strlen($body) / 1024, 2) . 'K' . $linebreak
        . $dom->getElementsByTagName('link')->length . ' Link Tags' . $linebreak
        . $dom->getElementsByTagName('script')->length . ' Script Tags' . $linebreak
        . $dom->getElementsByTagName('img')->length . ' Images' . $linebreak
        . '<form method="post" action="http://validator.w3.org/check"><p><input type="hidden" name="fragment" value="' . htmlentities($body) . '"' . $this->getClosingBracket()
        . '<input type="submit" value="Validate With W3C"' . $this->getClosingBracket() . '</p></form>';

    return $panel;
}
/**
 * An <h1> containing bold text must be transformed into a header title that
 * renders with the <b> markup intact.
 */
public function testTitleTransformedWithBold()
{
    $transformer = new Transformer();
    $rules = file_get_contents(__DIR__ . '/wp-rules.json');
    $transformer->loadRules($rules);

    // The leading XML declaration carries the encoding of the fragment.
    $markup = '<?xml encoding="utf-8" ?><h1>Title <b>in bold</b></h1>';

    libxml_use_internal_errors(true);
    $dom = new \DOMDocument();
    $dom->loadHtml($markup);
    libxml_use_internal_errors(false);

    $header = Header::create();
    $transformer->transform($header, $dom);

    $rendered = $header->getTitle()->render();
    $this->assertEquals('<h1>Title <b>in bold</b></h1>', $rendered);
}
/**
 * Builds an excerpt array from a chunk of HTML.
 *
 * When several elements match $tag, the excerpt describes the last one.
 *
 * @param string $html Chunk of HTML
 * @param string $tag  Tag name, for example `img`
 * @return array|null ['element' => ['name' => ..., 'attributes' => [...]]],
 *                    or null when no element matches
 */
public static function getExcerptFromHtml($html, $tag)
{
    $document = new \DOMDocument();
    $document->loadHtml($html);

    $matches = $document->getElementsByTagName($tag);
    if ($matches->length === 0) {
        return null;
    }

    // Only the final match ends up in the result.
    $element = $matches->item($matches->length - 1);

    $attributes = [];
    foreach ($element->attributes as $name => $attribute) {
        $attributes[$name] = $attribute->value;
    }

    return [
        'element' => [
            'name' => $element->tagName,
            'attributes' => $attributes,
        ],
    ];
}
/**
 * Truncates a summary (lead paragraph) given as an HTML fragment.
 *
 * The fragment is wrapped in a UTF-8 HTML document, the nodes selected by
 * self::_truncateSummaryDom() are removed, and the remaining body markup is
 * returned. When anything was removed, $dot is appended; previously a
 * hard-coded '...' was appended and $dot was ignored.
 *
 * @param string $string HTML fragment
 * @param int    $width  Truncation width, passed to _truncateSummaryDom()
 * @param string $dot    Suffix appended when the text was truncated
 *
 * @return string
 */
public static function truncateSummary($string, $width, $dot = '…')
{
    // User-supplied fragments routinely trigger parser warnings; collect
    // them internally and restore the caller's setting afterwards.
    $previous = libxml_use_internal_errors(true);
    $dom = new \DOMDocument();
    $dom->loadHtml('<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>' . $string . '</body></html>');
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $body = $dom->getElementsByTagName('body');
    $will_remove_nodes = self::_truncateSummaryDom($body->item(0)->childNodes, $width);

    foreach ($will_remove_nodes as $node) {
        $node->parentNode->removeChild($node);
    }

    // Build the result from the serialised <body> content; fall back to the
    // untouched input when the body cannot be located.
    $dom_result = $dom->saveHTML();
    preg_match('#<body>(.*)</body>#is', $dom_result, $result);
    $result = isset($result[1]) ? trim($result[1]) : $string;

    if ($will_remove_nodes) {
        $result .= $dot;
    }

    return $result;
}
/**
 * Builds a menu with one link and one folder holding two links, renders it
 * and echoes the rendered markup after a DOMDocument round trip.
 */
public function testMenuItemCollection()
{
    $collection = new MenuItemCollection();

    $carLink = $collection->appendLink('Car', ['href' => '/products/car']);
    ok($carLink);

    $others = $collection->appendFolder('Others');
    ok($others);

    foreach (['A' => '/products/a', 'B' => '/products/b'] as $label => $href) {
        $others->appendLink($label, ['href' => $href]);
    }

    $rendered = $collection->render();

    $document = new DOMDocument('1.0');
    $document->preserveWhiteSpace = false;
    $document->formatOutput = true;
    $document->loadHtml($rendered);

    echo $document->saveHTML();
}
/**
 * parse
 *
 * Reads the rows of the first table in $this->html (body > table > tbody >
 * tr) and stores their cells in $this->parsed_table, indexed by row and
 * column. The row counter advances for every <tr>, including rows without
 * any <td>.
 */
public function parse()
{
    $document = new DOMDocument();
    @$document->loadHtml($this->html);
    $tree = simplexml_import_dom($document);

    $table = array();
    $rowIndex = 0;

    foreach ($tree->body->table->tbody->tr as $row) {
        $columnIndex = 0;
        foreach ($row->td as $cell) {
            $table[$rowIndex][$columnIndex] = $cell;
            $columnIndex += 1;
        }
        $rowIndex += 1;
    }

    $this->parsed_table = $table;
}
/**
 * Creates a DOMDocument from an HTML string.
 *
 * Sets $this->error to false on success, to ERROR_TYPE_ENCODING when the
 * (optionally formatted) HTML is empty, and to ERROR_TYPE_DOM_PARSING when
 * the parser rejects it.
 *
 * @param string      $html
 * @param string|null $charset      Passed to formatHtml().
 * @param string|null $charset_hint Passed to formatHtml().
 * @param bool        $format       Whether to run formatHtml() first.
 * @return DOMDocument|false The document, or false on failure.
 */
public function createFromHtml($html, $charset = null, $charset_hint = null, $format = true)
{
    $source = $format ? $this->formatHtml($html, $charset, $charset_hint) : $html;

    if (!$source) {
        $this->error = self::ERROR_TYPE_ENCODING;
        return false;
    }

    $document = new DOMDocument("1.0", "utf-8");
    $loaded = @$document->loadHtml($source);

    if (!$loaded) {
        $this->error = self::ERROR_TYPE_DOM_PARSING;
        return false;
    }

    $this->error = false;
    return $document;
}
/**
 * Sets target="_blank" on every anchor of an HTML fragment.
 *
 * Parser errors above warning level are passed to fof_log().
 *
 * @param string $content UTF-8 HTML fragment.
 * @return string The fragment with updated anchors.
 */
function fof_item_targets($content)
{
    /* collect parser messages instead of emitting them */
    $previousErrorMode = libxml_use_internal_errors(true);

    /*
        The fragment is parsed inside a wrapper div, which is stripped again
        on output. Without it, bare text would get wrapped in p elements by
        the parser.
    */
    $wrapped = '<div>' . mb_convert_encoding($content, 'HTML-ENTITIES', "UTF-8") . '</div>';
    $document = new DOMDocument();
    $document->loadHtml($wrapped);

    /* drop the <!DOCTYPE> the parser adds */
    $document->removeChild($document->firstChild);

    /* put the wrapper div in place of the <html><body> the parser adds */
    $wrapper = $document->firstChild->firstChild->firstChild;
    $document->replaceChild($wrapper, $document->firstChild);

    /* replace or add link targets */
    $finder = new DOMXpath($document);
    foreach ($finder->query('//a') as $anchor) {
        $anchor->setAttribute('target', '_blank');
    }

    /* serialise each node directly inside the wrapper div */
    $pieces = array();
    for ($child = $document->firstChild->firstChild; $child; $child = $child->nextSibling) {
        $pieces[] = $document->saveHTML($child);
    }
    $content_out = implode('', $pieces);

    foreach (libxml_get_errors() as $problem) {
        /* anything that is not a mere warning gets logged */
        if ($problem->level !== LIBXML_ERR_WARNING) {
            fof_log(__FUNCTION__ . ': ' . $problem->message);
        }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    return $content_out;
}
/**
 * Extracts attraction entries from an HTML page and returns them as JSON.
 *
 * Inside every element whose class attribute equals
 * "wrap al_border attraction_element" the method collects
 *  - titles:  <a> whose parent has class "property_title",
 *  - ratings: <img> whose grandparent has class "rs rating" (alt text),
 *  - reviews: <span> with class "more",
 *  - tags:    <span> whose parent has class "p13n_reasoning_v2",
 * and emits one entry per title/rating/review/tag combination.
 *
 * Each node is now classified once instead of inside four nested loops; the
 * entries and their order are unchanged. With no matches the result is "[]"
 * (it used to be "null" from an undefined variable).
 *
 * @param string $_Data Path or URL readable by file_get_contents().
 * @return string JSON list of objects with the keys Title, Rating, Review
 *                and Tag.
 */
public function getData($_Data)
{
    $html = file_get_contents($_Data);

    $previous = libxml_use_internal_errors(true);
    $xml = new DOMDocument();
    // loadHTML() rejects an empty source, so only parse real content.
    if (is_string($html) && $html !== '') {
        $xml->loadHtml($html);
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($xml);
    $links = array();

    foreach ($xpath->query("//*[@class='wrap al_border attraction_element']") as $container) {
        $titles = array();
        foreach ($container->getElementsByTagName("a") as $_Title) {
            $parent = $_Title->parentNode;
            if ($parent instanceof DOMElement && $parent->getAttribute('class') == "property_title") {
                $titles[] = trim(preg_replace("/[\r\n]+/", " ", $_Title->nodeValue));
            }
        }

        $ratings = array();
        foreach ($container->getElementsByTagName("img") as $_Rating) {
            $grandParent = $_Rating->parentNode ? $_Rating->parentNode->parentNode : null;
            if ($grandParent instanceof DOMElement && $grandParent->getAttribute('class') == "rs rating") {
                $_Rating_Text = trim(preg_replace("/[\r\n]+/", " ", $_Rating->getAttribute("alt")));
                // Only the first word of the alt text is reported.
                $_Rating_Digit = explode(" ", $_Rating_Text);
                $ratings[] = $_Rating_Digit[0];
            }
        }

        $reviews = array();
        $tags = array();
        foreach ($container->getElementsByTagName("span") as $span) {
            if ($span->getAttribute('class') == "more") {
                $reviews[] = trim(preg_replace("/[\r\n]+/", " ", $span->nodeValue));
            }
            $parent = $span->parentNode;
            if ($parent instanceof DOMElement && $parent->getAttribute('class') == "p13n_reasoning_v2") {
                $tags[] = trim(preg_replace("/[\r\n]+/", " ", $span->nodeValue));
            }
        }

        // Same nesting order as before: title, rating, review, tag.
        foreach ($titles as $title) {
            foreach ($ratings as $rating) {
                foreach ($reviews as $review) {
                    foreach ($tags as $tag) {
                        $links[] = array('Title' => $title, 'Rating' => $rating, 'Review' => $review, 'Tag' => $tag);
                    }
                }
            }
        }
    }

    return json_encode($links);
}
/**
 * Scrapes the product list reachable through $this->ch and returns it as
 * pretty-printed JSON.
 *
 * For every product the linked detail page is downloaded to determine its
 * size and description. Products without a link are skipped. A missing
 * description yields an empty string, and a price that is missing or not
 * numeric is left out of the total.
 *
 * @return string JSON object with 'results' (title, size, unit_price,
 *                description per product) and 'total'.
 */
public function getGroceryData()
{
    $html = curl_exec($this->ch);
    $this->chackNotEmpty($html, $this->ch);

    $dom = new DOMDocument();
    @$dom->loadHtml($html);
    $xpath = new DOMXPath($dom);
    $products = $xpath->query("//div[@class='product ']");

    $total = 0;
    $results = array();

    foreach ($products as $product) {
        // Product anchor: provides the title and the detail page link.
        $anchor = $xpath->query("div//a", $product)->item(0);
        if ($anchor === null) {
            continue;
        }

        $text = trim(preg_replace("/[\r\n]+/", " ", $anchor->nodeValue));
        $link = trim(preg_replace("/[\r\n]+/", " ", $anchor->getAttribute("href")));

        curl_setopt($this->ch, CURLOPT_URL, $link);
        $linked_html = curl_exec($this->ch);
        $this->chackNotEmpty($linked_html, $this->ch);

        // Size of the linked HTML without assets.
        $sizeBites = strlen($linked_html);
        $size = $sizeBites > 1024 ? round($sizeBites / 1024, 2) . "kb" : $sizeBites . "b";

        // Product description from the detail page. An earlier query for
        // 'productText' paragraphs was overwritten immediately and is gone.
        $dom_linked_html = new DOMDocument();
        @$dom_linked_html->loadHtml($linked_html);
        $xpath_linked_html = new DOMXPath($dom_linked_html);
        $descriptionNode = $xpath_linked_html->query("//div[contains(@class, 'Text')]")->item(0);
        $description = $descriptionNode !== null
            ? trim(preg_replace("/[\r\n]+/", " ", $descriptionNode->nodeValue))
            : '';

        // Unit price. NOTE(review): the byte offset 2 presumably skips a
        // two-byte currency symbol - confirm against the scraped markup.
        $priceNode = $xpath->query('div//p[@class="pricePerUnit"]', $product)->item(0);
        $price = $priceNode !== null
            ? substr(trim(preg_replace("/[\r\n]+/", " ", $priceNode->nodeValue)), 2, 4)
            : '';

        if (is_numeric($price)) {
            $total += $price;
        }

        $results[] = array('title' => $text, 'size' => $size, 'unit_price' => $price, 'description' => $description);
    }

    return json_encode(array('results' => $results, 'total' => $total), JSON_PRETTY_PRINT);
}
/**
 * Extracts dictionary results from a result page.
 *
 * For every "concept_light" block the result holds, where present:
 *  - 'ja':   text of the representation span (the last one found wins),
 *  - 'type': getType() of the first meaning-tags block of a definition
 *            (later definitions overwrite earlier ones),
 *  - 'en':   list of all meaning texts,
 *  - 'url':  href of the details link.
 *
 * @param string $input HTML of the result page.
 * @return array Results indexed by block position; an empty array when
 *               nothing matches (formerly an undefined variable).
 */
public function scrape($input)
{
    $alfred_results = array();

    // Suppress warnings relating to the markup and restore the caller's
    // libxml error mode afterwards.
    $previous = libxml_use_internal_errors(true);
    libxml_clear_errors();

    $doc = new DOMDocument();
    // loadHTML() rejects an empty source, so only parse real content.
    if (is_string($input) && $input !== '') {
        $doc->loadHtml($input);
    }

    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($doc);
    $nodes = $xpath->query("//div[contains(concat(' ',@class,' '),' concept_light ')]");

    foreach ($nodes as $i => $result) {
        // Japanese word
        $readings = $xpath->query("div/div/div[contains(concat(' ',@class,' '),' concept_light-representation ')]/span[contains(concat(' ',@class,' '),' text ')]", $result);
        foreach ($readings as $word) {
            $alfred_results[$i]['ja'] = trim($word->nodeValue);
        }

        // Result details
        $definitions = $xpath->query("div[contains(concat(' ',@class,' '),' concept_light-meanings ')]/div[contains(concat(' ',@class,' '),' meanings-wrapper ')]", $result);
        foreach ($definitions as $definition) {
            // Type (verb, noun, etc): only the first tag block is used.
            $type = $xpath->query("div[contains(concat(' ',@class,' '),' meaning-tags ')]", $definition)->item(0);
            if ($type !== null) {
                $alfred_results[$i]['type'] = $this->getType(trim(strtolower($type->nodeValue)));
            }

            // Definitions
            $words = $xpath->query("div[contains(concat(' ',@class,' '),' meaning-wrapper ')]/div[contains(concat(' ',@class,' '),' meaning-definition ')]/span[contains(concat(' ',@class,' '),' meaning-meaning ')]", $definition);
            foreach ($words as $word) {
                $alfred_results[$i]['en'][] = trim($word->nodeValue);
            }
        }

        // Link
        $anchors = $xpath->query("a[contains(concat(' ',@class,' '),' light-details_link ')]", $result);
        foreach ($anchors as $anchor) {
            $alfred_results[$i]['url'] = $anchor->getAttribute("href");
        }
    }

    return $alfred_results;
}
/**
 * Parses translation table cells into ParserResult entries.
 *
 * The td cells with class "text" are consumed in pairs. The first cell of a
 * pair is stored as the original word when isSearchWord() says so, otherwise
 * as the translation. The second cell fills whichever side is still missing,
 * after which the entry is appended to $resultArray.
 *
 * @param string $input       HTML containing the result table.
 * @param array  $resultArray Receives the ParserResult entries (by reference).
 */
function parseEntries($input, array &$resultArray)
{
    $document = new DOMDocument();
    $document->loadHtml($input);
    $finder = new DOMXPath($document);
    $cells = $finder->query("//table/*/tr/td[@class='text']");

    $position = 1;
    $entry = new ParserResult();

    foreach ($cells as $cell) {
        // Strip <small> tags until removeSmallTags() reports none left, then
        // run it once more as the original flow did.
        do {
            $remaining = $this->removeSmallTags($cell);
        } while ($remaining->length != 0);
        $this->removeSmallTags($cell);

        $isSearchWord = $this->isSearchWord($cell);
        $languageCode = $this->getLanguageCode($cell);
        $value = utf8_decode(trim($cell->nodeValue));

        $firstOfPair = ($position % 2 != 0);
        $position++;

        if ($firstOfPair) {
            if ($isSearchWord) {
                $entry->originalWord = $value;
            } else {
                $entry->languageCode = $languageCode;
                $entry->translatedWord = $value;
            }
            continue;
        }

        // Second cell of the pair: complete the entry and emit it.
        if ($entry->languageCode == "") {
            $entry->languageCode = $languageCode;
            $entry->translatedWord = $value;
        } else {
            $entry->originalWord = $value;
        }

        $resultArray[] = $entry;
        $entry = new ParserResult();
    }
}
/**
 * Scrapes the sessions of one month from the first <tbody> of the page
 * returned by get_ris_html().
 *
 * A session is reported for every row whose ninth cell yields at least one
 * document link. Rows with fewer than nine cells are skipped.
 *
 * @param mixed $year  Passed to get_ris_html() and used in 'datum'.
 * @param mixed $month Passed to get_ris_html() and used in 'datum'.
 * @return array|false List of array('datum' => 'dd.<month>.<year>',
 *                     'Sitzung' => ..., 'links' => ...); false when the page
 *                     could not be fetched (after echoing 'connection error').
 */
function get_ris_sessions($year, $month)
{
    $html = get_ris_html($year, $month);
    if ($html === false) {
        echo 'connection error';
        return false;
    }

    $oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();

    $dom = new DOMDocument();
    // loadHTML() refuses an empty source, so only parse real content.
    if (is_string($html) && $html !== '') {
        $dom->loadHtml($html);
    }

    $sessions = array();
    $lastdate = 0;

    $tbody = $dom->getElementsByTagName('tbody')->item(0);
    $trs = $tbody !== null ? $tbody->getElementsByTagName('tr') : array();

    foreach ($trs as $tr) {
        $tds = $tr->getElementsByTagName('td');

        // Determine the date. Rows with an empty date cell belong to the
        // date shown further up in the same column.
        $dayCell = $tds->item(3);
        $day = $dayCell !== null ? intval(trim($dayCell->textContent)) : 0;
        if ($day > 0) {
            $lastdate = sprintf("%'.02d", $day);
        }

        // Determine the document links.
        $linkCell = $tds->item(8);
        if ($linkCell === null) {
            continue;
        }

        $links = get_links_from_td($linkCell);
        if (count($links) > 0) {
            $sessions[] = array(
                'datum' => $lastdate . '.' . $month . '.' . $year,
                'Sitzung' => $tds->item(5)->nodeValue,
                'links' => $links,
            );
        }
    }

    // Always reached now, so the caller's libxml error mode is restored.
    libxml_clear_errors();
    libxml_use_internal_errors($oldSetting);

    return $sessions;
}