/**
 * Collect the snippet text of every result on a Yahoo SERP page.
 *
 * Each element matching the ".aAbs" selector contributes one snippet: its
 * inner markup is passed through cleanText() and then fixRepeatedSpace().
 *
 * @param SimpleHtmlDom $SHDObject Parsed SERP document to search.
 *
 * @return array Whatever normalizeResult() produces for the collected snippets.
 */
protected function getPageSnippets($SHDObject)
{
    $collected = array();

    $matches = $SHDObject->find('.aAbs');
    foreach ($matches as $node) {
        $cleaned = $this->cleanText($node->innertext);
        $collected[] = $this->fixRepeatedSpace($cleaned);
    }

    // normalizeResult() is relied upon to keep only the organic results.
    $organic = $this->normalizeResult($collected);

    return $organic;
}
/**
 * Build a SimpleHtmlDom document from an HTML string.
 *
 * The document object is always constructed first; the string is only loaded
 * into it when it is non-empty and not longer than MAX_FILE_SIZE bytes.
 * Note that "non-empty" follows PHP's empty(), so the string "0" is rejected.
 *
 * @param string $str             HTML source to parse.
 * @param bool   $lowercase       Forwarded to the constructor and to load().
 * @param bool   $forceTagsClosed Forwarded to the constructor.
 * @param string $target_charset  Forwarded to the constructor.
 * @param bool   $stripRN         Forwarded to the constructor and to load().
 * @param string $defaultBRText   Forwarded to the constructor.
 * @param string $defaultSpanText Forwarded to the constructor.
 *
 * @return SimpleHtmlDom|false The loaded document, or false when the input was rejected.
 */
public static function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $document = new SimpleHtmlDom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    $isLoadable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($isLoadable) {
        $document->load($str, $lowercase, $stripRN);

        return $document;
    }

    // Rejected input: release the freshly created document before bailing out.
    $document->clear();

    return false;
}
/**
 * Collect the snippet text of every result on a Bing SERP page.
 *
 * Bing places each snippet in a <p> element nested under an element with the
 * "b_caption" class, so those paragraphs are the ones harvested here.
 *
 * @param SimpleHtmlDom $SHDObject Parsed SERP document to search.
 *
 * @return array Whatever normalizeResult() produces for the collected snippets.
 */
protected function getPageSnippets($SHDObject)
{
    $collected = array();

    $paragraphs = $SHDObject->find('.b_caption p');
    foreach ($paragraphs as $paragraph) {
        $cleaned = $this->cleanText($paragraph->innertext);
        $collected[] = $this->fixRepeatedSpace($cleaned);
    }

    // normalizeResult() is relied upon to keep only the organic results.
    $organic = $this->normalizeResult($collected);

    return $organic;
}
/**
 * Collect the snippet text of every result on a Google SERP page.
 *
 * Google places each snippet in an element carrying the "st" class, so every
 * match of the ".st" selector contributes one entry.
 *
 * @param SimpleHtmlDom $SHDObject Parsed SERP document to search.
 *
 * @return array Whatever normalizeResult() produces for the collected snippets.
 */
protected function getPageSnippets($SHDObject)
{
    $collected = array();

    $spans = $SHDObject->find('.st');
    foreach ($spans as $span) {
        $cleaned = $this->cleanText($span->innertext);
        $collected[] = $this->fixRepeatedSpace($cleaned);
    }

    // normalizeResult() is relied upon to keep only the organic results.
    $organic = $this->normalizeResult($collected);

    return $organic;
}
/**
 * Return this node's plain-text content.
 *
 * Resolution order:
 *  1. a stored inner value (HDOM_INFO_INNER) is returned as-is;
 *  2. text nodes return their stored text with noise restored;
 *  3. comment and unknown nodes, and <script>/<style> elements, yield '';
 *  4. otherwise the text of all child nodes is concatenated.
 *
 * @return string
 */
public function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    switch ($this->nodetype) {
        case HDOM_TYPE_TEXT:
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT:
        case HDOM_TYPE_UNKNOWN:
            return '';
    }

    // Script and style bodies are not considered readable text.
    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // In rare cases (seen on element nodes such as some <span> and <p> tags)
    // $this->nodes is NULL for no known reason; such a node has no text.
    if (null === $this->nodes) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $child->text();
    }

    // Append the configured span separator so the text of adjacent spans
    // does not run together in the plain-text output.
    if ($this->tag == 'span') {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
/**
 * Rebuild the opening-tag markup for this node.
 *
 * Nodes that carry stored text (text, comment, unknown) return that text with
 * noise restored. For other nodes the tag name and every remaining attribute
 * are re-assembled using the whitespace and quote style recorded per
 * attribute position, followed by the recorded end spacing and '>'.
 *
 * @return string
 */
private function makeup()
{
    // text, comment, unknown
    if (isset($this->_[DomInfo::TEXT])) {
        return $this->dom->restore_noise($this->_[DomInfo::TEXT]);
    }

    $markup = '<' . $this->tag;

    $nextIndex = 0;
    foreach ($this->attr as $name => $value) {
        // The recorded spacing/quote info is positional, so the index must
        // advance even for attributes that are skipped below.
        $index = $nextIndex++;

        // Attributes set to null/false have been removed.
        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[DomInfo::SPACE][$index][0];

        // Value-less attributes (nowrap, checked, selected, ...) emit the name only.
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        // Loose comparison on purpose: mirrors switch/case semantics.
        $quoteStyle = $this->_[DomInfo::QUOTE][$index];
        if ($quoteStyle == QuoteType::DOUBLE) {
            $quote = '"';
        } elseif ($quoteStyle == QuoteType::SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name
            . $this->_[DomInfo::SPACE][$index][1]
            . '='
            . $this->_[DomInfo::SPACE][$index][2]
            . $quote . $value . $quote;
    }

    $restored = $this->dom->restore_noise($markup);

    return $restored . $this->_[DomInfo::ENDSPACE] . '>';
}
/**
 * Release the object graph held by this document.
 *
 * Exists to work around the PHP 5 memory leak caused by circular references:
 * every node, child, parent and root is cleared in turn and the remaining
 * document properties are unset.
 */
public function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // Clearing children as well is the fix recorded in the SourceForge
    // repository (2977248) for leaks that persisted even when clear() was used.
    if (isset($this->children)) {
        /** @noinspection PhpWrongForeachArgumentTypeInspection */
        foreach ($this->children as $child) {
            if (is_object($child)) {
                /** @noinspection PhpUndefinedMethodInspection */
                $child->clear();
            }
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    // Drop whatever is left, including parent/root when they were not set above.
    unset($this->doc);
    unset($this->docArray);
    unset($this->noise);
    unset($this->parent);
    unset($this->root);
}
/**
 * Scrape the Kongregate contests page and store a snapshot of it.
 *
 * Every "table.contests" element becomes one list holding the trimmed plain
 * text of its "tr.js-game-hover" rows. The list of lists is JSON-encoded and
 * inserted into the `contests` table together with the current timestamp.
 *
 * NOTE(review): a failed download is not handled here; file_get_contents()
 * then returns false and that value is handed to the parser unchanged.
 *
 * @return array One entry per contests table, each an array of row texts.
 */
public function set()
{
    $data = file_get_contents('http://www.kongregate.com/contests?haref=hp_devcontest');

    $dom = new SimpleHtmlDom();
    $dom->load($data);

    $tables = [];
    foreach ($dom->find('table.contests') as $element) {
        // Start every table with a fresh array. Resetting the accumulator to
        // '' (as was done before) makes the next `[] =` append a fatal error
        // on PHP >= 7.1 and stored '' instead of [] for empty tables.
        $rows = [];
        foreach ($element->find('tr.js-game-hover') as $tr) {
            $rows[] = trim($tr->plaintext);
        }
        $tables[] = $rows;
    }

    $this->_DB->query('INSERT INTO contests (data, snap_date) VALUES (:data, :snap_date)');
    $this->_DB->bind(['data' => json_encode($tables), 'snap_date' => date('Y-m-d H:i:s')]);
    $this->_DB->execute();

    return $tables;
}
/**
 * Create a DOM document from an HTML string.
 *
 * A new SimpleHtmlDom is constructed with the given tag-closing flag, the
 * string is loaded into it, and the document object is returned.
 *
 * @param string $str             HTML source to load.
 * @param bool   $forceTagsClosed Forwarded to the SimpleHtmlDom constructor.
 *
 * @return \voku\helper\SimpleHtmlDom The document the string was loaded into.
 */
public static function str_get_html($str, $forceTagsClosed = true)
{
    $document = new SimpleHtmlDom(null, $forceTagsClosed);

    $document->load($str);

    return $document;
}
/**
 * Create navigation items for a track from pasted content.
 *
 * Reads the request parameters "html", "parentId" and "trackId". Nothing is
 * done without a non-zero trackId. Content containing an HTML list is parsed
 * and handed to makeFromUl(); anything else goes to makeFromNewLines().
 *
 * @return void
 */
public function pasteListAction()
{
    $html = Redokes_Controller_Front::getInstance()->getParam('html', '');
    $parentId = intval(Redokes_Controller_Front::getInstance()->getParam('parentId', 0));
    $trackId = intval(Redokes_Controller_Front::getInstance()->getParam('trackId', 0));

    if (!$trackId) {
        return;
    }

    if (preg_match('/<ul|<ol/i', $html)) {
        // HTML list: rewrite ordered lists as unordered ones. The detection
        // above is case-insensitive, so the rewrite must be as well;
        // otherwise "<OL>" is detected but never converted.
        $html = str_ireplace(array('<ol', '</ol'), array('<ul', '</ul'), $html);

        // Load the markup and take the first top-level <ul>.
        $dom = new SimpleHtmlDom($html);
        $ul = $dom->find('>ul', 0);
        $this->makeFromUl($ul, $trackId, $parentId);
    } else {
        // Multi-line and single-line text are handled by the same routine.
        $this->makeFromNewLines($html, $trackId, $parentId);
    }

    // NOTE(review): clearing the track cache
    // (Navigation_Model_Track::clearCache() for $trackId) used to sit behind
    // an unconditional `return` and therefore never ran. It is left disabled
    // to preserve behaviour; re-enable deliberately if it is wanted.
}
/**
 * Look up a postal code on psc.posta.sk for a city and street.
 *
 * The page is requested through an HTTP stream context carrying the
 * URL-encoded city/street pair, and the plain text of the first ".zip"
 * element in the response is returned.
 *
 * NOTE(review): the parameters are sent as request content on a GET request;
 * confirm the remote service actually reads them that way.
 *
 * @param string $city   City name to search for.
 * @param string $street Street name to search for.
 *
 * @return string The postal code text, or a fixed Slovak "not found" message
 *                when the page could not be loaded or holds no ".zip" element.
 */
public static function getZipFromCity($city, $street)
{
    // Options for the HTTP request.
    $options = array('http' => array('header' => "Content-Type: application/x-www-form-urlencoded", 'method' => "GET", 'content' => http_build_query(array('city' => $city, 'street' => $street))));

    $context = stream_context_create($options);

    // Fetch and parse the response page.
    $html = SimpleHtmlDom::file_get_html('http://psc.posta.sk/', false, $context);

    // A failed fetch does not yield a document object; calling find() on it
    // would be a fatal error, so report "not found" instead.
    if (!is_object($html)) {
        return "Nepodarilo sa najsť PSČ.";
    }

    $zipMessageObject = $html->find('.zip');

    if (isset($zipMessageObject[0]->plaintext)) {
        return $zipMessageObject[0]->plaintext;
    }

    return "Nepodarilo sa najsť PSČ.";
}