/**
 * Collects the contents of every <loc> element found in a sitemap.
 *
 * @param string $sitemapLocation Location passed to HtmlDomParser::file_get_html().
 *
 * @return array The inner text of each <loc> element, in document order; an empty
 *               array when the sitemap could not be loaded.
 */
protected static function getUrlsFromSitemap($sitemapLocation)
{
    $sitemap = HtmlDomParser::file_get_html($sitemapLocation);

    // The parser hands back a falsy value when the document cannot be loaded;
    // calling find() on it would be a fatal error.
    if (!$sitemap) {
        return [];
    }

    $urls = [];
    foreach ($sitemap->find('loc') as $loc) {
        $urls[] = $loc->innertext;
    }

    return $urls;
}
/**
 * Prints the plain text of the "#simple-list_11643" element of the
 * p-karaj.tvu.ac.ir home page.
 *
 * Nothing is printed when the page cannot be loaded or the element is absent.
 */
function get_news()
{
    $news_page = HtmlDomParser::file_get_html("http://p-karaj.tvu.ac.ir/");
    if (!$news_page) {
        return;
    }

    // With an index argument, find() returns a single node (or null), not a list.
    // The previous code went on to index that node like an array, which is an error,
    // and the values it computed afterwards were never used.
    $elems = $news_page->find("#simple-list_11643 ", 0);
    if (!$elems) {
        return;
    }

    echo $elems->plaintext;
}
/**
 * Loads and parses the document at $url, falling back to the `wget` binary
 * when the parser's own loader yields nothing.
 *
 * @param string $url Address of the document to download.
 *
 * @return mixed Whatever HtmlDomParser produced for the document (falsy on failure).
 */
protected function downloadUrl($url)
{
    $html = HtmlDomParser::file_get_html($url);

    if (!$html) {
        $wget_result = [];

        // The URL is untrusted input: quote it for the shell, and use "--" so a
        // value starting with "-" cannot be interpreted as a wget option.
        exec('wget -qO- -- ' . escapeshellarg($url) . ' 2>&1', $wget_result);

        // exec() collects the output as an array of lines; the parser needs a string.
        $html = HtmlDomParser::str_get_html(implode("\n", $wget_result));
    }

    return $html;
}
/**
 * Retrieves information about a page from its URL.
 *
 * @param string $url The full address of the page.
 *
 * @return array An array with the keys 'title', 'text', 'images' and 'metas'.
 *
 * @throws \InvalidArgumentException If the address is malformed.
 * @throws \RuntimeException If the page cannot be loaded.
 */
public function fetch($url)
{
    // Make sure we were given a URL.
    if (!$this->testUrl($url)) {
        throw new \InvalidArgumentException($this->text->sprintf('APP_ERROR_BAD_URL', $url));
    }

    // Load the content behind the address.
    $dom = @HtmlDomParser::file_get_html($url);

    // Loading failed.
    if (empty($dom)) {
        throw new \RuntimeException($this->text->sprintf('APP_ERROR_UNABLE_TO_LOAD_URL', $url));
    }

    // Page title (empty when the document has no <title>).
    $titleNode = $dom->find('title', 0);
    $page_title = $titleNode ? $titleNode->text() : '';

    // Meta tags: keep the content of every <meta> that matches one of the configured descriptors.
    $metas = [];
    $head = $dom->find('head', 0);
    $metaElements = $head ? $head->find('meta') : [];

    foreach ($metaElements as $element) {
        foreach ($this->meta as $meta) {
            if (!$element->hasAttribute($meta['key'])) {
                continue;
            }

            if (strtolower($element->getAttribute($meta['key'])) == $meta['tag']) {
                $content = $element->getAttribute('content');

                if (!empty($content)) {
                    $metas[$meta['name']] = $content;
                }
            }
        }
    }

    // Page text, whitespace-normalised.
    $bodyNode = $dom->find('body', 0);
    $body = $bodyNode ? trim($bodyNode->plaintext) : '';
    $body = preg_replace('/\\s+/', ' ', $body);

    // Cut the excerpt at the first space at or after byte 200. Shorter texts, and
    // texts without a later space, are kept whole: strpos() rejects an offset past
    // the end of the string, and a false position would otherwise empty the excerpt.
    if (strlen($body) > 200) {
        $pos = strpos($body, ' ', 200);

        if ($pos !== false) {
            $body = substr($body, 0, $pos);
        }
    }

    // Images.
    $images = [];

    foreach ($dom->find('img') as $element) {
        // Only keep valid URLs.
        if (!$this->testUrl($element->src)) {
            continue;
        }

        $parts = UriHelper::parse_url($element->src);

        // A URL without scheme, host or path cannot be rebuilt below.
        if (!isset($parts['scheme'], $parts['host'], $parts['path'])) {
            continue;
        }

        // Only keep image file extensions.
        if (in_array($this->file_ext($parts['path']), $this->image_extensions)) {
            $images[] = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
        }
    }

    // Reaching this point means everything went fine.
    return ['title' => $page_title, 'text' => $body, 'images' => $images, 'metas' => $metas];
}
/**
 * Extracts the price shown in the "div.pricelabel strong" element of a page.
 *
 * Spaces are stripped from the label before the digit groups are collected.
 *
 * @param string $url Address of the page to inspect.
 *
 * @return array|string The list of digit groups matched in the price label
 *                      (first element of the preg_match_all() result), or the
 *                      string "0" when the page, the label or the digits are missing.
 */
public static function getPrice($url)
{
    $parser = new HtmlDomParser();
    $dom = $parser->file_get_html($url);

    // The page could not be loaded.
    if (!$dom) {
        return "0";
    }

    $label = $dom->find('div.pricelabel strong', 0);
    $price = $label ? $label->plaintext : null;
    unset($dom);

    if (empty($price)) {
        return "0";
    }

    preg_match_all("/(\\d+)/", str_replace(" ", "", $price), $matches);

    return !empty($matches[0]) ? $matches[0] : "0";
}
/**
 * Loads and parses an HTML page.
 *
 * @param $page Location handed to HTMLDomParser::file_get_html()
 *
 * @return mixed|null The value produced by the parser, or null when that value is empty
 */
public function ReadPage($page)
{
    $document = HTMLDomParser::file_get_html($page);

    return empty($document) ? null : $document;
}
/**
 * Get players info
 *
 * Walks the pages from $offset to $limit, builds one entry per ".player-row"
 * element via player(), keeps the entries accepted by the optional "before"
 * callback and passes them through the optional "after" callback.
 *
 * @param int      $offset First page number.
 * @param int|null $limit  Last page number; defaults to the value of pages().
 * @return array
 */
public function players($offset = 1, $limit = null)
{
    if (is_null($limit)) {
        $limit = $this->pages();
    }

    $players = [];

    foreach (range($offset, $limit) as $page) {
        $html = Html::file_get_html($this->url . $page);

        // Skip pages that could not be loaded instead of failing on find().
        if (!$html) {
            continue;
        }

        foreach ($html->find('.player-row') as $player) {
            $data = $this->player($player);

            // An entry is rejected only when a "before" filter exists and returns a falsy value.
            if (null != $this->before && !call_user_func($this->before, $data)) {
                continue;
            }

            $players[] = null != $this->after ? call_user_func($this->after, $data) : $data;
        }
    }

    return $players;
}
/**
 * Downloads the RSS feed described by $request and turns it into a list of listings.
 *
 * Each listing holds 'id', 'link', 'title' and 'description'. When the request
 * asks to follow links, the listing page is loaded as well and 'date',
 * 'page_title', 'location', 'price', 'body', 'attributes' and the
 * request-defined selector labels are added. Details that are missing on the
 * page are stored as null ('location' as an empty string).
 *
 * @param CraigslistRequest $request
 *
 * @return array Listings keyed by id; empty when the feed cannot be read or parsed.
 */
private function getRSS(CraigslistRequest $request)
{
    $body = file_get_contents($request->url());
    if ($body === false) {
        return [];
    }

    $listings = simplexml_load_string(utf8_encode($body));
    if ($listings === false) {
        return [];
    }

    // Reads one property of the first node matching $selector, or null when there is no such node.
    $firstValue = function ($dom, $selector, $property) {
        $node = $dom->find($selector, 0);

        return $node ? $node->{$property} : null;
    };

    $results = [];

    foreach ($listings as $item) {
        $link = (string) $item->link;
        $title = (string) $item->title;

        // The id is taken from a fixed position near the end of the link.
        $id = substr($link, -15, -5);
        if (!is_numeric($id)) {
            continue;
        }

        if ($this->remove_duplicates) {
            if (in_array($id, $this->ids) || in_array($title, $this->titles)) {
                continue;
            }

            $this->ids[] = $id;
            $this->titles[] = $title;
        }

        $results[$id] = [
            'id' => $id,
            'link' => $link,
            'title' => $title,
            'description' => (string) $item->description,
        ];

        if (!$request->follow()) {
            continue;
        }

        $results[$id]['content'] = [];

        $dom = HtmlDomParser::file_get_html($link);

        // The listing page could not be loaded: keep the feed data only.
        if (!$dom) {
            continue;
        }

        $results[$id]['date'] = $firstValue($dom, 'time', 'datetime');
        $results[$id]['page_title'] = $firstValue($dom, '.postingtitletext', 'innertext');
        $results[$id]['location'] = str_replace(
            ['(', ')'],
            '',
            (string) $firstValue($dom, '.postingtitletext small', 'innertext')
        );
        $results[$id]['price'] = $firstValue($dom, '.price', 'innertext');
        $results[$id]['body'] = $firstValue($dom, '.postingbody, #postingbody, #postingBody', 'innertext');

        foreach ($dom->find('.attrgroup span') as $attr) {
            $results[$id]['attributes'][] = $attr->innertext;
        }

        foreach ($request->selectors as $selector) {
            $target = $selector['target'];

            foreach ($dom->find($selector['element']) as $k => $attr) {
                // Honour the optional per-selector limit.
                if (isset($selector['limit']) && $k > $selector['limit'] - 1) {
                    continue;
                }

                $results[$id][$selector['label']][] = $attr->{$target};
            }
        }
    }

    return $results;
}
/**
 * Lists the ".name" entries of every theater block whose heading matches a cinema name.
 *
 * For each child of every ".movie_results" element, the <h2> headings inside its
 * ".desc" elements are lower-cased and compared with $cinema_name; on a match,
 * the text of each ".name" element of that child is collected.
 *
 * @param string $str         Location of the page to parse.
 * @param string $cinema_name Name to compare the lower-cased headings with.
 *
 * @return array A list of one-element arrays, each holding a ".name" text.
 */
public static function findMovies($str, $cinema_name)
{
    $matches = [];
    $page = HtmlDomParser::file_get_html($str);

    foreach ($page->find(".movie_results") as $resultBlock) {
        foreach ($resultBlock->children() as $theater) {
            foreach ($theater->find('.desc') as $description) {
                foreach ($description->find('h2') as $heading) {
                    if (strtolower($heading->text()) != $cinema_name) {
                        continue;
                    }

                    foreach ($theater->find('.name') as $name) {
                        $matches[] = [$name->text()];
                    }
                }
            }
        }
    }

    return $matches;
}
/**
 * Downloads the torrent file linked from a torrent card page into the temporary folder.
 *
 * The download link is the href of the "a[id=telecharger]" element, appended to
 * CPASBIEN_BASE_URL. The local file name is the last path segment of
 * $torrentUrlCard with its last five characters replaced by ".torrent".
 *
 * @param string $torrentUrlCard Address of the torrent card page.
 *
 * @return string Path of the written file.
 *
 * @throws \RuntimeException When the card page, the download link or the torrent
 *                           file itself cannot be retrieved.
 */
public function downloadTorrentFile($torrentUrlCard)
{
    $dom = HtmlDomParser::file_get_html($torrentUrlCard);
    if (!$dom) {
        throw new \RuntimeException('Unable to load torrent page: ' . $torrentUrlCard);
    }

    $downloadLink = $dom->find('a[id=telecharger]', 0);
    if (!$downloadLink) {
        throw new \RuntimeException('No download link found on torrent page: ' . $torrentUrlCard);
    }

    $urlTorrentFile = self::CPASBIEN_BASE_URL . $downloadLink->href;

    $curl = curl_init($urlTorrentFile);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_COOKIESESSION, true);
    $fileContent = curl_exec($curl);
    $curlError = curl_error($curl);
    curl_close($curl);

    // Do not write an empty ".torrent" file when the transfer failed.
    if ($fileContent === false) {
        throw new \RuntimeException('Unable to download ' . $urlTorrentFile . ': ' . $curlError);
    }

    // Keep what follows the last "/" of the card URL.
    $lastSlash = strrpos($torrentUrlCard, '/');
    $filename = $lastSlash === false ? $torrentUrlCard : substr($torrentUrlCard, $lastSlash + 1);

    // Swap the five-character suffix (presumably ".html" — confirm) for ".torrent".
    $filename = substr($filename, 0, strlen($filename) - 5) . '.torrent';

    $filePath = $this->tmpFolder . '/' . $filename;
    file_put_contents($filePath, $fileContent);

    return $filePath;
}
/**
 * Runs a search against SERVICE_URL and converts every "ul.songs li" element
 * of the response into an mp3withResult.
 *
 * The id and url come from the item's data-id / data-mp3 attributes; title and
 * artist are filled in only when the item contains a ".song" element.
 *
 * @param string $query Search terms (URL-encoded before being placed in SERVICE_URL).
 *
 * @return array List of mp3withResult objects.
 */
public function search($query)
{
    $serviceUrl = sprintf(self::SERVICE_URL, urlencode($query));
    $page = HtmlDomParser::file_get_html($serviceUrl);
    $results = array();

    /* @var $item \simple_html_dom_node */
    foreach ($page->find('ul.songs li') as $item) {
        $entry = new mp3withResult();
        $entry->id = $item->attr['data-id'];
        $entry->url = 'http://mp3with.co' . $item->attr['data-mp3'];

        $details = $item->find('.song', 0);
        if ($details) {
            $entry->title = trim($details->find('strong', 0)->innertext);
            $entry->artist = trim($details->find('strong.artist', 0)->innertext);
        }

        $results[] = $entry;
    }

    return $results;
}
/**
 * Get party information from URL
 *
 * Every ".borderbox1" element of the page yields one entry with the keys
 * 'name', optionally 'website', 'short_name', 'leader', 'headquarters',
 * 'eligible_date', 'registered_date', optionally 'deregistered_date',
 * 'chef_agent' and 'auditor'.
 *
 * @param $url
 * @return array of party information defined as above; empty when the page cannot be loaded
 */
protected function getPartiesInfo($url)
{
    $html = HtmlDomParser::file_get_html($url);
    if (!$html) {
        return [];
    }

    $party = [];

    foreach ($html->find('.borderbox1') as $index => $partyHtml) {
        $nameHtml = $partyHtml->find('h3.partytitle', 0);

        // If name section has website link.
        // strpos() is compared with false explicitly so that a match at offset 0 still counts.
        if (strpos((string) $nameHtml, 'href') !== false) {
            $websiteLink = $nameHtml->find('a', 0);
            $party[$index]['name'] = $websiteLink->innertext;
            $party[$index]['website'] = $websiteLink->href;
        } else {
            $party[$index]['name'] = $nameHtml->innertext;
        }

        // Get party info from left column: the paragraphs are read in document order.
        $infoPartOneHtml = $partyHtml->find('div.colun', 0);
        $infoPartOneIndex = 0;

        $shortNameHtml = $infoPartOneHtml->find('p', $infoPartOneIndex++)->innertext;
        $party[$index]['short_name'] = $this->getAfterFirstBr($shortNameHtml);

        $leaderHtml = $infoPartOneHtml->find('p', $infoPartOneIndex++)->innertext;
        $party[$index]['leader'] = $this->getAfterFirstBr($leaderHtml);

        $headquartersHtml = $infoPartOneHtml->find('p', $infoPartOneIndex++)->innertext;
        $party[$index]['headquarters'] = $this->getAfterFirstBr($headquartersHtml);

        // Get party info from right column
        $infoPartTwoHtml = $partyHtml->find('div.coldeux', 0);
        $infoPartTwoIndex = 0;

        $eligibleHtml = $infoPartTwoHtml->find('p', $infoPartTwoIndex++)->innertext;
        $party[$index]['eligible_date'] = $this->getAfterSpan($eligibleHtml);

        $registeredHtml = $infoPartTwoHtml->find('p', $infoPartTwoIndex++)->innertext;
        $party[$index]['registered_date'] = $this->getAfterSpan($registeredHtml);

        // The "Deregistered" paragraph is optional; when present it shifts the following ones by one.
        if (strpos((string) $infoPartTwoHtml->innertext, 'Deregistered') !== false) {
            $deregisteredHtml = $infoPartTwoHtml->find('p', $infoPartTwoIndex++)->innertext;
            $party[$index]['deregistered_date'] = $this->getAfterSpan($deregisteredHtml);
        }

        $chefAgentHtml = $infoPartTwoHtml->find('p', $infoPartTwoIndex++)->innertext;
        $party[$index]['chef_agent'] = $this->getAfterFirstBr($chefAgentHtml);

        $auditorHtml = $infoPartTwoHtml->find('p', $infoPartTwoIndex++)->innertext;
        $party[$index]['auditor'] = $this->getAfterFirstBr($auditorHtml);
    }

    return $party;
}
/**
 * Crawls one pending film URL and queues the links found on it.
 *
 * The newest App\Film row with check = 0 is marked as checked. When its URL is
 * on filmiha.com and its path does not start with "tag", every link of that
 * page that is not stored yet is saved as a new App\Film row. Links pointing
 * elsewhere (or to "tag" paths) are saved already marked as checked.
 *
 * Always ends by dumping a status message through dd().
 */
public function getFind()
{
    $url_for_check = App\Film::where('check', '0')->orderBy('id', 'desc')->first();

    // first() yields null when nothing is pending.
    if (!$url_for_check) {
        dd("No unchecked URLs left.");
        return;
    }

    $url_for_check->check = 1;
    $url_for_check->save();

    // True when the URL is on filmiha.com and its path does not start with "tag".
    // Relative or malformed URLs (no host) are never crawlable.
    $isCrawlable = function ($url) {
        $parts = parse_url($url);
        if (!is_array($parts) || !isset($parts['host']) || $parts['host'] != 'filmiha.com') {
            return false;
        }

        $path = isset($parts['path']) ? $parts['path'] : '';

        return substr($path, 1, 3) != 'tag';
    };

    if ($isCrawlable($url_for_check->url)) {
        $dom = HtmlDomParser::file_get_html($url_for_check->url);
        $links = $dom ? $dom->find('a') : [];

        foreach ($links as $link) {
            $href = $link->href;

            // Anchors without an href carry nothing to queue.
            if (!$href) {
                continue;
            }

            if (App\Film::where('url', $href)->count() != 0) {
                continue;
            }

            $new_href = new App\Film();
            if (!$isCrawlable($href)) {
                $new_href->check = 1;
            }
            $new_href->url = $href;
            $new_href->save();
        }
    }

    dd("Operation Was Successful! :) ");
}
/**
 * Retrieves the seller's phone number for an advert page.
 *
 * The advert id is read from the class attribute of the first
 * "div.rel ul.brbott-12 li" element and posted to the olx.ua contact endpoint.
 * When the JSON answer wraps the number in <span class="block">, the content of
 * the first such span is returned; otherwise the raw value is returned.
 *
 * @param string $url Address of the advert page.
 *
 * @return string|false The phone number, or false when any step fails.
 */
protected static function getPhoneNumber($url)
{
    $parser = new HtmlDomParser();
    $dom = $parser->file_get_html($url);
    if (!$dom) {
        return false;
    }

    $items = $dom->find('div.rel ul.brbott-12 li');
    $classAttribute = !empty($items) ? $items[0]->class : null;
    unset($dom);

    if (empty($classAttribute)) {
        return false;
    }

    // The id sits in a "'id':'…'" fragment of the class attribute.
    if (!preg_match("/\\'id\\'\\:\\'(.*?)\\'/", $classAttribute, $match)) {
        return false;
    }
    $uuid = $match[1];

    $curl = curl_init();
    if (!$curl) {
        return false;
    }

    curl_setopt($curl, CURLOPT_URL, 'http://olx.ua/ajax/misc/contact/phone/' . rawurlencode($uuid) . '/white/');
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, "");
    $out = curl_exec($curl);
    curl_close($curl);

    if (!is_string($out)) {
        return false;
    }

    $phone = json_decode($out);
    if (empty($phone->value)) {
        return false;
    }

    if (preg_match("/<span\\sclass=\"block\">(.*)<\\/span\\>/", $phone->value)) {
        $ddom = $parser->str_get_html($phone->value);
        $span = $ddom ? $ddom->find('span[class=block]', 0) : null;

        if ($span) {
            return $span->innertext;
        }
    }

    return $phone->value;
}
/**
 * Finds the main image and further source links of the page posted as "url".
 *
 * The CSS selector of the main image comes from the source configured for the
 * crawler posted as "crawler". Further sources are the first
 * ".domainLinkWrapper" link and every ".boardLinkWrapper" link inside the
 * first ".PagedCollection" element.
 *
 * NOTE(review): the URL comes straight from the request and is fetched
 * server-side; confirm callers are trusted or restrict the allowed hosts.
 *
 * @param bool $returnJson When true the result is echoed as JSON and the script exits.
 *
 * @return array|false ['image' => ..., 'more-sources' => [...]] when $returnJson is
 *                     false; false when a parameter is missing or the page cannot
 *                     be loaded.
 */
public function getImage($returnJson = true)
{
    $url = isset($_POST['url']) ? $_POST['url'] : null;
    $crawlerId = isset($_POST['crawler']) ? $_POST['crawler'] : null;

    // Validate the input before touching the model.
    if (empty($url) || empty($crawlerId)) {
        return false;
    }

    $crawlerModel = new CrawlerModel();
    $crawlerInfo = $crawlerModel->getCrawlerInfo($crawlerId);
    $source = $crawlerModel->getSourceById($crawlerInfo['source_type']);

    $dom = HtmlDomParser::file_get_html($url);
    if (!$dom) {
        return false;
    }

    $images = $dom->find($source['main_image']);
    $imgSrc = isset($images[0]) ? $images[0]->src : null;

    /**
     * find more sources
     */
    $moreSources = array();

    $moreFromBlock = $dom->find('.domainLinkWrapper');
    if (isset($moreFromBlock[0])) {
        $moreSources[] = $moreFromBlock[0]->href;
    }

    // paged collection (suggested boards under pin)
    $pagedCollection = $dom->find('.PagedCollection');
    if (!empty($pagedCollection[0])) {
        foreach ($pagedCollection[0]->find('.boardLinkWrapper') as $blw) {
            $moreSources[] = $blw->href;
        }
    }

    $imgData = array('image' => $imgSrc, 'more-sources' => $moreSources);

    if ($returnJson) {
        echo json_encode($imgData);
        exit;
    }

    return $imgData;
}
/**
 * Stores the link and loads its document into $this->dom.
 *
 * @param string|null $link Location of the document to load.
 * @param mixed       $dom  Accepted but not used by this method.
 */
public function init($link = null, $dom = null)
{
    $this->link = $link;

    $document = HtmlDomParser::file_get_html($this->link);
    $this->dom = $document;
}
/**
 * Walks the rst.ua result pages of a search and stores every listing that is not known yet.
 *
 * Pages are requested as "$baseUrl&start=N" for N = 1..1000 and converted from
 * windows-1251 to UTF-8. For each listing block ("div[class=rst-ocb-i]", the
 * last block of a page is left out) an Items record is saved with the product
 * title, price, link, phone number and an "options" string holding year, fuel,
 * transmission, mileage and city.
 *
 * @param $baseUrl     Search URL the "&start=N" suffix is appended to.
 * @param $subcategory Value stored in Items::subcategory_id.
 */
private function saveParseRst($baseUrl, $subcategory)
{
    set_time_limit(0);
    error_reporting(E_ALL & ~E_NOTICE);

    $parser = new HtmlDomParser();

    // Plain text of a node, or '' when the node is missing.
    $textOf = function ($node) {
        return $node ? (string) $node->plaintext : '';
    };

    // First line of a text (the dot does not match line breaks).
    $firstLine = function ($text) {
        return preg_match("/(.*)/", $text, $m) ? $m[0] : '';
    };

    for ($j = 1; $j <= 1000; $j++) {
        $url = $baseUrl . '&start=' . $j;

        $page = $parser->file_get_html($url);
        if (!$page) {
            continue;
        }

        $html = iconv('windows-1251', 'UTF-8//IGNORE', (string) $page);
        $dom = $html !== false ? $parser->str_get_html($html) : false;
        if (!$dom) {
            continue;
        }

        // Run the selector once per page instead of on every loop test.
        $rows = $dom->find('div[class=rst-ocb-i]');
        $rowCount = count($rows) - 1;

        for ($i = 0; $i < $rowCount; $i++) {
            $ddom = $parser->str_get_html($rows[$i]->innertext);
            if (!$ddom) {
                continue;
            }

            $anchor = $ddom->find('a.rst-ocb-i-a', 0);
            if (!$anchor) {
                continue;
            }
            $link = 'http://rst.ua' . $anchor->href;

            // Known listings need no further work; in particular the detail page is not downloaded.
            if (!$this->productUnique($link)) {
                continue;
            }

            $details = $ddom->find('li[class=rst-ocb-i-d-l-i]');
            $year = $firstLine($textOf(isset($details[1]) ? $details[1] : null));
            $fuel = $firstLine($textOf(isset($details[2]) ? $details[2] : null));
            $city = $firstLine($textOf($ddom->find('li[class=rst-ocb-i-d-l-j]', 0)));
            $product = $textOf($ddom->find('h3[class=rst-ocb-i-h]', 0));
            $price = str_replace("'", "", $textOf($ddom->find('span[class=rst-ocb-i-d-l-i-s rst-ocb-i-d-l-i-s-p]', 0)));

            // Phone number: first run of digits of the contact block on the detail page,
            // stored as a plain string.
            $phone = '';
            $phone_parser = $parser->file_get_html($link);
            if ($phone_parser) {
                $phoneText = $textOf($phone_parser->find('p[class=rst-page-oldcars-item-option-block-container]', 0));
                if (empty($phoneText)) {
                    $phoneText = $textOf($phone_parser->find('div.rst-page-oldcars-item-option-block-container td', 0));
                }
                if (preg_match("/\\d+/", $phoneText, $phoneMatch)) {
                    $phone = $phoneMatch[0];
                }
            }
            unset($phone_parser);

            preg_match("/\\((\\d+).*?\\)/", $year, $running);
            preg_match("/\\((.*?)\\)/", $fuel, $transmission);
            preg_match("/\\d+/", $year, $yearMatch);
            preg_match("/\\-?\\d+(\\.\\d{0,})?(.*?)\\(/", $fuel, $fuelMatch);
            preg_match("/(\\d+)/", $price, $priceMatch);
            $cityParts = explode(":", $city);

            $model = new Items();
            $model->product = $product;

            if (!empty($priceMatch)) {
                $model->price = $priceMatch[0];
            } else {
                $model->options = "договорная";
                $model->price = "0";
            }

            $model->url = $link;
            $model->store = 'Rst';
            $model->phone = $phone;
            $model->subcategory_id = $subcategory;

            // NOTE(review): the options string is assembled by hand (no escaping) and, for
            // listings without a numeric price, is prefixed with "договорная". The stored
            // format is kept as it was; confirm what readers of this column expect.
            $model->options .= '{"year":"' . trim(isset($yearMatch[0]) ? $yearMatch[0] : '')
                . '","fuel":"' . trim(isset($fuelMatch[2]) ? $fuelMatch[2] : '')
                . '","transmission":"' . trim(isset($transmission[1]) ? $transmission[1] : '')
                . '","running":"' . trim(isset($running[1]) ? $running[1] : '')
                . '","city":"' . trim(isset($cityParts[1]) ? $cityParts[1] : '')
                . '","b/u":"1"}';

            $model->save();
        }
    }
}
/**
 * Stores the property URL and loads its document into $this->dom.
 *
 * @param string|null $propertyUrl Location of the document to load.
 * @param mixed       $dom         Accepted but not used by this method.
 */
public function init($propertyUrl = null, $dom = null)
{
    $this->propertyUrl = $propertyUrl;

    $document = HtmlDomParser::file_get_html($propertyUrl);
    $this->dom = $document;
}
<?php

// Demo script: dumps the inner text of the <title> element of bilibili.com.

require "vendor/autoload.php";

use Sunra\PhpSimple\HtmlDomParser;

$page = HtmlDomParser::file_get_html("http://bilibili.com");
$titleNode = $page->find('title', 0);
$title = $titleNode->innertext;

// <pre> keeps the var_dump() layout readable in a browser.
echo "<pre>";
var_dump($title);
/**
 * Scrapes the option table and the futures data page of a symbol for one month
 * and builds an OptionPrice for every call/put with a positive price.
 *
 * The futures contract (symbol + expiration date read from the option page) is
 * looked up in the repository and persisted when it does not exist yet. The
 * first two rows and the trailing rows of the price table are skipped.
 *
 * @param Symbol $symbol
 * @param int $monthNumber Month between 1 and 12.
 * @throws WrongMonth When $monthNumber is outside 1..12.
 * @return OptionPrice[] Empty when a page cannot be loaded or reports an error.
 */
private function collectOptionPrices(Symbol $symbol, int $monthNumber) : array
{
    if ($monthNumber < 1 || $monthNumber > 12) {
        throw new WrongMonth();
    }

    $optionTableUrl = $this->createOptionTableUrl($symbol, $monthNumber);

    /** @var simple_html_dom $optionHtml */
    $optionHtml = HtmlDomParser::file_get_html($optionTableUrl);
    if (!$optionHtml) {
        return [];
    }

    $futuresDataHtml = null;

    // The parsed documents are cleared in "finally" so that every exit path,
    // including the early returns, releases them.
    try {
        if (count($optionHtml->find('span.error'))) {
            return [];
        }

        $futuresDataUrl = $this->createFuturesDataUrl($symbol, $monthNumber);

        /** @var simple_html_dom $futuresDataHtml */
        $futuresDataHtml = HtmlDomParser::file_get_html($futuresDataUrl);
        if (!$futuresDataHtml || count($futuresDataHtml->find('span.error'))) {
            return [];
        }

        $futuresExpirationDate = strip_tags($optionHtml->find('#divContent table table tr td')[1]->text());

        $futuresPrice = $this->makeFloat($futuresDataHtml->find('#dtaLast')[0]->text());
        $futuresPrice52WeekNode = $futuresDataHtml->find('#main-content tr td td span strong');
        $futuresPrice52WeekHigh = $this->makeFloat($futuresPrice52WeekNode[0]->text());
        $futuresPrice52WeekLow = $this->makeFloat($futuresPrice52WeekNode[1]->text());

        // The futures page is no longer needed; release it before the remaining work.
        $futuresDataHtml->clear();
        $futuresDataHtml = null;

        $priceLines = $optionHtml->find('.datatable_simple tr');
        $priceLinesCount = count($priceLines);

        $futures = $this->futuresRepository->findOneBySymbolAndExpiration($symbol, $futuresExpirationDate);
        if (!$futures) {
            $futures = new Futures($symbol, $futuresExpirationDate);
            $this->futuresRepository->persist($futures);
            $this->futuresRepository->flush();
        }

        $prices = [];

        foreach ($priceLines as $priceLineKey => $priceLine) {
            // Skip the leading and trailing rows of the table.
            if ($priceLineKey < 2 || $priceLineKey > $priceLinesCount - 5) {
                continue;
            }

            // Cell 0 holds the call price, cell 3 the strike and cell 4 the put price.
            $priceLineNodes = $priceLine->children;
            $strike = $this->makeFloat($priceLineNodes[3]->text());
            $callPrice = $this->makeFloat($priceLineNodes[0]->text());
            $putPrice = $this->makeFloat($priceLineNodes[4]->text());

            if ($callPrice > 0) {
                $prices[] = $this->createOptionPrice(OptionContract::TYPE_CALL, $callPrice, $strike, $futuresPrice, $futuresPrice52WeekHigh, $futuresPrice52WeekLow, $futures);
            }

            if ($putPrice > 0) {
                $prices[] = $this->createOptionPrice(OptionContract::TYPE_PUT, $putPrice, $strike, $futuresPrice, $futuresPrice52WeekHigh, $futuresPrice52WeekLow, $futures);
            }
        }

        return $prices;
    } finally {
        if ($futuresDataHtml) {
            $futuresDataHtml->clear();
        }

        $optionHtml->clear();
    }
}
$elems = $news_page->find("#simple-list_11643 ", 0); $message = $elems->plaintext; $response = $client->sendMessage(array('chat_id' => $chatid, 'text' => $message, 'reply_to_message_id' => $messageid)); break; case 'اخبار': case '/akhbar': case '/akhbar@BeheshtiNotifierBot': $params = array('chat_id' => $chatid, 'action' => 'typing'); $response = $client->sendChatAction($params); $news_page = HtmlDomParser::file_get_html("http://p-karaj.tvu.ac.ir/"); $elems = $news_page->find(".full-list article header a "); $link = $elems[0]->href; $fixLink = str_replace('./', '/', $link); $behe = "http://p-karaj.tvu.ac.ir"; $url = $behe . $fixLink; $dom = HtmlDomParser::file_get_html($url); $elems = $dom->find("#content article div", 0); $message = $elems->plaintext; $response = $client->sendMessage(array('chat_id' => $chatid, 'text' => $message, 'reply_to_message_id' => $messageid)); break; case 'کلاس جبرانی': case '/jobrani': case '/jobrani@BeheshtiNotifierBot': // $params = array('chat_id' => $chatid, 'action' => 'typing'); // $response = $client -> sendChatAction($params); // $news_page =HtmlDomParser::file_get_html( "http://p-karaj.tvu.ac.ir/" ); // $elems = $news_page->find("#simple-list_12031",0); // $message =$elems->plaintext; $response = $client->sendMessage(array('chat_id' => $chatid, 'text' => 'این بخش غیر فعال است', 'reply_to_message_id' => $messageid)); break; default:
/**
 * Renders the SEO metrics report for the page named by the "url" request parameter.
 *
 * The page is downloaded and parsed, then checked for robots.txt, a sitemap,
 * HTTPS availability, W3C validator results, Google PageSpeed scores, meta and
 * heading tags, focus keywords (the "keywords" request parameter,
 * comma-separated) and readability statistics. The results are passed to the
 * "_seo_metrics.twig" template; the "_error" template is rendered when the URL
 * is not absolute or the page cannot be parsed.
 *
 * The templates path is switched to the plugin's templates while rendering and
 * switched back afterwards. The call is a no-op while $this->parsingDom is set.
 */
public function actionRenderMetrics()
{
    if (!$this->parsingDom) {
        $this->parsingDom = true;

        $oldPath = method_exists(craft()->templates, 'getTemplatesPath') ? craft()->templates->getTemplatesPath() : craft()->path->getTemplatesPath();
        $newPath = craft()->path->getPluginsPath() . 'seomatic/templates';
        method_exists(craft()->templates, 'setTemplatesPath') ? craft()->templates->setTemplatesPath($newPath) : craft()->path->setTemplatesPath($newPath);

        /* -- Render the SEOmatic display preview template */

        $url = urldecode(craft()->request->getParam('url'));

        if (UrlHelper::isAbsoluteUrl($url)) {
            $urlParts = parse_url($url);

            $rootUrl = (isset($urlParts['scheme']) ? $urlParts['scheme'] : "http") . "://" . $urlParts['host'];

            // The port has to be separated from the host by a colon.
            if (isset($urlParts['port'])) {
                $rootUrl .= ":" . $urlParts['port'] . "/";
            } else {
                $rootUrl .= "/";
            }

            $keywordsParam = urldecode(craft()->request->getParam('keywords'));
            $keywordsKeys = explode(",", $keywordsParam);
            $keywords = array();

            $userAgent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13';

            /* -- Performs a GET request with cURL and returns the response body (false on failure).
                  NOTE(review): certificate verification is switched off, as it was before. */
            $curlGet = function ($target) use ($userAgent) {
                $ch = curl_init();
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
                curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
                curl_setopt($ch, CURLOPT_URL, $target);
                $result = curl_exec($ch);
                curl_close($ch);

                return $result;
            };

            /* -- Returns the text between "$name" and the next PHP_EOL in $haystack, or "" when $name is absent */
            $valueAfter = function ($haystack, $name) {
                $pos = strpos($haystack, $name);
                if ($pos === false) {
                    return "";
                }
                $pos += strlen($name);

                return substr($haystack, $pos, strpos($haystack, PHP_EOL, $pos) - $pos);
            };

            /* -- Silly work-around for what appears to be a file_get_contents bug with https -> http://stackoverflow.com/questions/10524748/why-im-getting-500-error-when-using-file-get-contents-but-works-in-a-browser */
            $opts = array('http' => array('header' => "User-Agent:" . $userAgent . "\r\n"));
            $context = stream_context_create($opts);

            $dom = HtmlDomParser::file_get_html($url, false, $context);

            if ($dom) {
                $textStatistics = new TS\TextStatistics();

                /* -- See if robots.txt exists */

                $hasRobotsTxt = false;
                $hasSitemap = false;
                $sitemapUrl = rtrim($rootUrl, '/') . "/sitemap.xml";
                $foundSitemapUrl = "";
                $robotsUrl = rtrim($rootUrl, '/') . "/robots.txt";

                $robots = @file_get_contents($robotsUrl, false, $context);
                if ($robots !== false) {
                    $hasRobotsTxt = true;

                    foreach (explode("\n", $robots) as $line) {
                        $line = ltrim($line);
                        $searchStr = 'Sitemap';
                        $pos = strpos($line, $searchStr);

                        if ($pos !== false) {
                            // Keep what follows "Sitemap", minus the colon and surrounding whitespace.
                            $pos += strlen($searchStr);
                            $foundSitemapUrl = substr($line, $pos);
                            $foundSitemapUrl = trim($foundSitemapUrl, ':');
                            $foundSitemapUrl = trim($foundSitemapUrl);
                        }
                    }
                }

                /* -- Check to see if a sitemap exists: first the one announced in robots.txt, then the default location */

                if ($foundSitemapUrl) {
                    $siteMapContents = @file_get_contents($foundSitemapUrl, false, $context, 0, 1);
                    if ($siteMapContents !== false) {
                        $hasSitemap = true;
                    }
                }

                if (!$hasSitemap) {
                    $siteMapContents = @file_get_contents($sitemapUrl, false, $context, 0, 1);
                    if ($siteMapContents !== false) {
                        $hasSitemap = true;
                    }
                }

                /* -- See if the site is https */

                $sslReturnCode = 0;
                $sslUrl = "https" . "://" . $urlParts['host'];
                if (isset($urlParts['port'])) {
                    $sslUrl .= ':' . $urlParts['port'] . '/';
                } else {
                    $sslUrl .= '/';
                }

                $ch = curl_init($sslUrl);
                curl_setopt($ch, CURLOPT_NOBODY, true);
                curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
                $open_basedir = ini_get('open_basedir');
                if (empty($open_basedir)) {
                    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
                }
                curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
                curl_exec($ch);
                $sslReturnCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
                curl_close($ch);

                /* -- Check to see if the page is valid */

                $validatorUrl = "https://validator.w3.org/check?uri=" . urlencode($url) . "&output=json";
                $validatorResult = $curlGet($validatorUrl);
                $validatorStatus = $validatorErrors = $validatorWarnings = "";

                // NOTE(review): the X-W3C-Validator-* values are searched for in the response
                // body; the request does not ask cURL to include response headers — confirm.
                if ($validatorResult) {
                    $validatorStatus = $valueAfter($validatorResult, "X-W3C-Validator-Status: ");
                    $validatorErrors = $valueAfter($validatorResult, "X-W3C-Validator-Errors: ");
                    $validatorWarnings = $valueAfter($validatorResult, "X-W3C-Validator-Warnings: ");
                }
                $validatorUrl = "https://validator.w3.org/check?uri=" . urlencode($url);

                /* -- Check Google Pagespeed insights for desktop */

                $pagespeedDesktopScore = "";
                $pagespeedDesktopUrl = "https://www.googleapis.com/pagespeedonline/v2/runPagespeed?url=" . urlencode($url) . "&strategy=desktop";
                $pagespeedDesktopResult = $curlGet($pagespeedDesktopUrl);
                $pageSpeedPageStats = array();

                if ($pagespeedDesktopResult) {
                    $pagespeedJson = json_decode($pagespeedDesktopResult, true);

                    if ($pagespeedJson) {
                        if (!empty($pagespeedJson['pageStats'])) {
                            $pageSpeedPageStats = $pagespeedJson['pageStats'];

                            // Missing byte counters count as zero; the total is their sum.
                            $total = 0;
                            foreach (array('htmlResponseBytes', 'cssResponseBytes', 'imageResponseBytes', 'javascriptResponseBytes', 'otherResponseBytes') as $bytesKey) {
                                if (empty($pageSpeedPageStats[$bytesKey])) {
                                    $pageSpeedPageStats[$bytesKey] = 0;
                                }
                                $total += $pageSpeedPageStats[$bytesKey];
                            }
                            $pageSpeedPageStats['totalResponseBytes'] = $total;
                        }

                        if (isset($pagespeedJson['responseCode']) && ($pagespeedJson['responseCode'] == "200" || $pagespeedJson['responseCode'] == "301" || $pagespeedJson['responseCode'] == "302")) {
                            if (isset($pagespeedJson['ruleGroups']['SPEED']['score'])) {
                                $pagespeedDesktopScore = intval($pagespeedJson['ruleGroups']['SPEED']['score']);
                            }
                        }
                    }
                }
                $pagespeedDesktopUrl = "https://developers.google.com/speed/pagespeed/insights/?url=" . urlencode($url) . "&tab=desktop";

                /* -- Check Google Pagespeed insights for mobile */

                $pagespeedMobileScore = "";
                $pagespeedMobileUsability = "";
                $pagespeedMobileUrl = "https://www.googleapis.com/pagespeedonline/v2/runPagespeed?url=" . urlencode($url) . "&strategy=mobile";
                $pagespeedMobileResult = $curlGet($pagespeedMobileUrl);

                if ($pagespeedMobileResult) {
                    $pagespeedJson = json_decode($pagespeedMobileResult, true);

                    if ($pagespeedJson) {
                        if (isset($pagespeedJson['responseCode']) && ($pagespeedJson['responseCode'] == "200" || $pagespeedJson['responseCode'] == "301" || $pagespeedJson['responseCode'] == "302")) {
                            if (isset($pagespeedJson['ruleGroups']['SPEED']['score'])) {
                                $pagespeedMobileScore = intval($pagespeedJson['ruleGroups']['SPEED']['score']);
                            }
                            if (isset($pagespeedJson['ruleGroups']['USABILITY']['score'])) {
                                $pagespeedMobileUsability = intval($pagespeedJson['ruleGroups']['USABILITY']['score']);
                            }
                        }
                    }
                }
                $pagespeedMobileUrl = "https://developers.google.com/speed/pagespeed/insights/?url=" . urlencode($url) . "&tab=mobile";

                /* -- Scrape for JSON-LD before we remove the <script> tags */

                $jsonLdTypes = array();
                foreach ($dom->find('script[type=application/ld+json]') as $elem) {
                    $jsonArray = json_decode($elem->innertext, true);
                    if (isset($jsonArray['@type'])) {
                        array_push($jsonLdTypes, $jsonArray['@type']);
                    }
                }
                $jsonLdTypes = array_unique($jsonLdTypes);

                /* -- Remove inline <script> and <style> tags, and then strip the DOM down */

                foreach ($dom->find('style') as $element) {
                    $element->outertext = '';
                }
                foreach ($dom->find('script') as $element) {
                    $element->outertext = '';
                }

                $strippedDom = html_entity_decode($dom->plaintext);
                $strippedDom = stripslashes($strippedDom);
                $htmlDom = html_entity_decode($dom->outertext);

                /* -- SEO statistics */

                $titleNode = $dom->find('title', 0);
                $titleTag = $titleNode ? html_entity_decode($titleNode->plaintext) : "";
                $titleLength = strlen($titleTag);

                $metaDescriptionTag = "";
                $metaDescriptionLength = 0;
                $elem = $dom->find('meta[name=description]', 0);
                if ($elem) {
                    $metaDescriptionTag = html_entity_decode($elem->content);
                    $metaDescriptionLength = strlen($metaDescriptionTag);
                }

                $metaTwitterTag = "";
                $elem = $dom->find('meta[name=twitter:card],meta[property=twitter:card]', 0);
                if ($elem) {
                    $metaTwitterTag = html_entity_decode($elem->content);
                }

                $metaOpenGraphTag = "";
                $elem = $dom->find('meta[property=og:type],meta[property=og:url],meta[property=og:title]', 0);
                if ($elem) {
                    $metaOpenGraphTag = html_entity_decode($elem->content);
                }

                $hasRelPublisherTag = false;
                $elem = $dom->find('link[rel=publisher]', 0);
                if ($elem) {
                    $hasRelPublisherTag = true;
                }

                $emptyImageAlts = count($dom->find('img[!alt]'));

                $h1Tags = count($dom->find('h1'));
                $h2Tags = count($dom->find('h2'));
                $h3Tags = count($dom->find('h3'));
                $h4Tags = count($dom->find('h4'));
                $h5Tags = count($dom->find('h5'));
                $totalHTags = $h1Tags + $h2Tags + $h3Tags + $h4Tags + $h5Tags;

                // Headings are "effective" with exactly one <h1>, at least three headings
                // overall and no skipped level.
                $effectiveHTags = true;
                if ($h1Tags != 1) {
                    $effectiveHTags = false;
                }
                if ($totalHTags < 3) {
                    $effectiveHTags = false;
                }
                if ($h2Tags == 0 && ($h3Tags || $h4Tags || $h5Tags)) {
                    $effectiveHTags = false;
                }
                if ($h3Tags == 0 && ($h4Tags || $h5Tags)) {
                    $effectiveHTags = false;
                }
                if ($h4Tags == 0 && $h5Tags) {
                    $effectiveHTags = false;
                }

                // A page whose markup is as long as its text would otherwise divide by zero.
                $markupLength = strlen($htmlDom) - strlen($strippedDom);
                $textToHtmlRatio = $markupLength != 0 ? strlen($strippedDom) / $markupLength * 100 : 0;

                $strippedDom = preg_replace('/\\s+/', ' ', $strippedDom);

                /* -- Extract the page keywords, and clean them up a bit */

                $pageKeywords = craft()->seomatic->extractKeywords($strippedDom);
                $pageKeywords = str_replace(",,", ",", $pageKeywords);
                $pageKeywords = str_replace(" ,", ",", $pageKeywords);
                $pageKeywords = str_replace(" .", ".", $pageKeywords);
                $pageKeywords = preg_replace('/\\.+/', '.', $pageKeywords);
                $pageKeywords = preg_replace('/,+/', ',', $pageKeywords);
                $pageKeywords = str_replace(",.,", ",", $pageKeywords);
                $pageKeywords = html_entity_decode($pageKeywords, ENT_COMPAT, 'UTF-8');

                /* -- Focus keywords */

                foreach ($keywordsKeys as $keywordsKey) {
                    $keywordsKey = trim($keywordsKey);

                    if (strlen($keywordsKey)) {
                        $needle = strtolower($keywordsKey);

                        // The heading count covers <h1> and <h2> elements.
                        $appearsInH1Tag = 0;
                        foreach ($dom->find('h1') as $element) {
                            $appearsInH1Tag += substr_count(strtolower($element->plaintext), $needle);
                        }
                        foreach ($dom->find('h2') as $element) {
                            $appearsInH1Tag += substr_count(strtolower($element->plaintext), $needle);
                        }

                        $appearsInImgTag = 0;
                        foreach ($dom->find('img') as $element) {
                            $appearsInImgTag += substr_count(strtolower($element->alt), $needle);
                        }

                        $appearsInAhrefTag = 0;
                        foreach ($dom->find('a') as $element) {
                            $appearsInAhrefTag += substr_count(strtolower($element->plaintext), $needle);
                        }

                        $keywords[$keywordsKey] = array(
                            'appearsInTitleTag' => substr_count(strtolower($titleTag), $needle),
                            'appearsInUrl' => substr_count(strtolower($url), $needle),
                            'appearsInMetaDescriptionTag' => substr_count(strtolower($metaDescriptionTag), $needle),
                            'appearsInH1Tag' => $appearsInH1Tag,
                            'appearsInAhrefTag' => $appearsInAhrefTag,
                            'appearsInImgTag' => $appearsInImgTag,
                            'appearsInPageKeywords' => substr_count(strtolower($pageKeywords), $needle),
                            'appearsOnWebPage' => substr_count(strtolower($strippedDom), $needle),
                        );
                    }
                }

                /* -- Text statistics */

                $wordCount = $textStatistics->wordCount($strippedDom);

                // floor() returns a float, so the minimum of one minute is enforced with "<".
                $readingTime = floor($wordCount / 200);
                if ($readingTime < 1) {
                    $readingTime = 1;
                }

                $fleschKincaidReadingEase = $textStatistics->fleschKincaidReadingEase($strippedDom);
                $fleschKincaidGradeLevel = $textStatistics->fleschKincaidGradeLevel($strippedDom);
                $gunningFogScore = $textStatistics->gunningFogScore($strippedDom);
                $colemanLiauIndex = $textStatistics->colemanLiauIndex($strippedDom);
                $smogIndex = $textStatistics->smogIndex($strippedDom);
                $automatedReadabilityIndex = $textStatistics->automatedReadabilityIndex($strippedDom);

                $vars = array(
                    'titleTag' => $titleTag,
                    'titleLength' => $titleLength,
                    'metaDescriptionTag' => $metaDescriptionTag,
                    'metaDescriptionLength' => $metaDescriptionLength,
                    'metaTwitterTag' => $metaTwitterTag,
                    'metaOpenGraphTag' => $metaOpenGraphTag,
                    'hasRelPublisherTag' => $hasRelPublisherTag,
                    'jsonLdTypes' => $jsonLdTypes,
                    'hasRobotsTxt' => $hasRobotsTxt,
                    'hasSitemap' => $hasSitemap,
                    'emptyImageAlts' => $emptyImageAlts,
                    'validatorUrl' => $validatorUrl,
                    'validatorStatus' => $validatorStatus,
                    'validatorErrors' => $validatorErrors,
                    'validatorWarnings' => $validatorWarnings,
                    'pageSpeedPageStats' => $pageSpeedPageStats,
                    'pagespeedDesktopScore' => $pagespeedDesktopScore,
                    'pagespeedDesktopUrl' => $pagespeedDesktopUrl,
                    'pagespeedMobileScore' => $pagespeedMobileScore,
                    'pagespeedMobileUsability' => $pagespeedMobileUsability,
                    'pagespeedMobileUrl' => $pagespeedMobileUrl,
                    'sslReturnCode' => $sslReturnCode,
                    'h1Tags' => $h1Tags,
                    'h2Tags' => $h2Tags,
                    'h3Tags' => $h3Tags,
                    'h4Tags' => $h4Tags,
                    'h5Tags' => $h5Tags,
                    'effectiveHTags' => $effectiveHTags,
                    'textToHtmlRatio' => $textToHtmlRatio,
                    'wordCount' => $wordCount,
                    'readingTime' => $readingTime,
                    'pageKeywords' => $pageKeywords,
                    'keywords' => $keywords,
                    'fleschKincaidReadingEase' => $fleschKincaidReadingEase,
                    'fleschKincaidGradeLevel' => $fleschKincaidGradeLevel,
                    'gunningFogScore' => $gunningFogScore,
                    'colemanLiauIndex' => $colemanLiauIndex,
                    'smogIndex' => $smogIndex,
                    'automatedReadabilityIndex' => $automatedReadabilityIndex,
                );

                $this->renderTemplate('_seo_metrics.twig', $vars);
            } else {
                $this->renderTemplate('_error', array('errorMessage' => "Error parsing the DOM. Is this a valid, publicly accessible URL?"));
            }
        } else {
            $this->renderTemplate('_error', array('errorMessage' => "Error loading the webpage. Is this a valid, publicly accessible URL?"));
        }

        method_exists(craft()->templates, 'setTemplatesPath') ? craft()->templates->setTemplatesPath($oldPath) : craft()->path->setTemplatesPath($oldPath);
    }

    $this->parsingDom = false;
}
$settings_db = array('driver' => 'sqlite', 'database' => 'vdm_posts_db.sqlite', 'prefix' => ''); // Bootstrap Eloquent ORM $container = new Container(); $connFactory = new \Illuminate\Database\Connectors\ConnectionFactory($container); $conn = $connFactory->make($settings['database']); $resolver = new \Illuminate\Database\ConnectionResolver(); $resolver->addConnection('default', $conn); $resolver->setDefaultConnection('default'); \Illuminate\Database\Eloquent\Model::setConnectionResolver($resolver); $numberPostsAdded = 0; $page = 0; // delete all lines in table before starting \Post::truncate(); while ($numberPostsAdded < $nbPostMax) { // get page $dom = HtmlDomParser::file_get_html($settings['script']['url'] . '?page=' . $page); // parse all posts in page foreach ($dom->find('div.post') as $post) { if ($numberPostsAdded < $nbPostMax) { // simple php parser can't get directly an object associated with two classes // then we have to filter on article directly after if ($post->class == "post article") { // get id $id = $post->id; // get content which is in multiple tags $content = ''; foreach ($post->find('p', 0)->find('a') as $sentence) { $content .= $sentence->plaintext; } // get author and date whick are not in separated field $authorDate = $post->find('.right_part', 0)->find('p', 1);
$elems = $news_page->find(".full-list article header a "); $link = $elems[0]->href; $fixLink = str_replace('./', '/', $link); //echo $fixLink; $behe = "http://p-karaj.tvu.ac.ir"; $url = $behe . $fixLink; //echo $url; $dom = HtmlDomParser::file_get_html($url); $elems = $dom->find("#content article div", 0); $message = $elems->plaintext; $params = array('chat_id' => $chatid, 'action' => 'typing'); $response = $client->sendChatAction($params); $response = $client->sendMessage(array('chat_id' => $chatid, 'text' => $message, 'reply_to_message_id' => $messageid)); } if ($text == '/jobrani' || $text == '/jobrani@BeheshtiNotifierBot') { $news_page = HtmlDomParser::file_get_html("http://p-karaj.tvu.ac.ir/"); $elems = $news_page->find("#simple-list_12031", 0); $message = $elems->plaintext; $params = array('chat_id' => $chatid, 'action' => 'typing'); $response = $client->sendChatAction($params); $response = $client->sendMessage(array('chat_id' => $chatid, 'text' => $message, 'reply_to_message_id' => $messageid)); } //switch($text){ // // case '/hi': // // // // // //
/**
 * Retrieves the latest $postCount posts from vdm.
 *
 * Pages are fetched one after another, starting at page 0, and every
 * `.article` node found is accumulated until at least $postCount nodes
 * have been collected. Paging stops early when a page cannot be loaded
 * or contains no `.article` node, so fewer than $postCount entries may
 * be returned.
 *
 * @param int $postCount the number of entries to retrieve
 * @return mixed the result of the Arrays chain: the collected nodes sorted
 *               by 'date' descending, limited to the first $postCount
 */
protected function getLatestPosts($postCount)
{
    // Accumulated article nodes across all fetched pages.
    $posts = [];
    // Pagination starts at page 0 (source architecture).
    $pageId = 0;

    // Keep fetching until enough posts are retrieved.
    while (Arrays::size($posts) < $postCount) {
        // getBaseUrl() is used as a sprintf format that receives the page id.
        $dom = HtmlDomParser::file_get_html(sprintf($this->getBaseUrl(), $pageId));

        // file_get_html() yields a non-object on failure; calling find() on it
        // would be a fatal error, so stop paging instead.
        if (!is_object($dom)) {
            break;
        }

        $domPosts = $dom->find('.article');

        // A page without any article means the source is exhausted; without
        // this guard the loop would keep requesting pages forever.
        if (empty($domPosts)) {
            break;
        }

        $posts = Arrays::merge($posts, $domPosts);
        $pageId++;
    }

    // Sort by descending date and keep only the first $postCount entries.
    return Arrays::from($posts)->sort('date', 'desc')->first($postCount)->obtain();
}
/**
 * Reads the futures overview page and returns one normalized symbol per
 * tracked contract.
 *
 * The page URL comes from $this->config["url"]["futuresall"]. Each contract
 * is located by a fixed table selector and row index; the text of the
 * row's second cell is read, cut at the first space, and the
 * second-to-last character of that token is dropped
 * (NOTE(review): presumably shortens a two-digit year suffix — confirm).
 *
 * @return array normalized symbols keyed by contract name, in the order
 *               CrudeOil, NaturalGas, Wheat, Corn, Soybeans, Emini,
 *               DJMini, Gold, Silver
 */
public function parseActualFutures()
{
    $page = HtmlDomParser::file_get_html($this->config["url"]["futuresall"]);

    // Table selector => (contract name => index of its row in that table).
    $layout = [
        'table#dt2 tbody' => ['CrudeOil' => 1, 'NaturalGas' => 4],
        'table#dt4 tbody' => ['Wheat' => 1, 'Corn' => 2, 'Soybeans' => 3],
        'table#dt5 tbody' => ['Emini' => 1, 'DJMini' => 3],
        'table#dt7 tbody' => ['Gold' => 1, 'Silver' => 2],
    ];

    $symbols = [];

    foreach ($layout as $selector => $contracts) {
        $tableBody = $page->find($selector, 0);

        foreach ($contracts as $contractName => $rowIndex) {
            $cellText = $tableBody->find('tr', $rowIndex)->find('td', 1)->plaintext;

            // Only the first space-separated token of the cell is the symbol.
            $tokens = explode(' ', $cellText);
            $ticker = $tokens[0];

            // Everything but the last two characters, followed by the very
            // last character: the second-to-last character is removed.
            $head = substr($ticker, 0, -2);
            $tail = substr(substr($ticker, -2), 1);

            $symbols[$contractName] = $head . $tail;
        }
    }

    return $symbols;
}
/**
 * Remembers the given URL and loads its document into $this->dom.
 *
 * The value returned by HtmlDomParser::file_get_html() is stored as-is;
 * no check is made here that loading succeeded.
 *
 * @param string $url address passed to HtmlDomParser::file_get_html()
 */
public function setUrl($url)
{
    $this->url = $url;

    $loadedDocument = HtmlDomParser::file_get_html($url);
    $this->dom = $loadedDocument;
}
/**
 * Loads the DOM object for a document.
 *
 * @param array $file document model data; its 'url' entry is the address to load
 * @return simple_html_dom|false the parsed DOM, or false when the page could
 *                               not be loaded or has no root node
 */
public function get_file($file)
{
    $html = HtmlDomParser::file_get_html($file['url']);

    // file_get_html() returns a non-object on failure; reading ->root on it
    // would raise a warning, so bail out before touching the property.
    if (!is_object($html)) {
        return false;
    }

    return $html->root ? $html : false;
}
# Fundamento $infraccion['sancion'] = parseSancion(html_entity_decode($tablaInfrancciones->find('td', 9)->plaintext)); #Sanción $GLOBALS['sumaAdeudos'] += $infraccion['sancion']['monto']; } else { $infraccion['motivo'] = html_entity_decode($tablaInfrancciones->find('td', 4)->plaintext); # Motivo $infraccion['fundamento'] = parseFundamento($tablaInfrancciones->find('td', 6)->plaintext); # Fundamento $infraccion['sancion'] = parseSancion(html_entity_decode($tablaInfrancciones->find('td', 8)->plaintext)); #Sanción } $infracciones[] = $infraccion; } # Adeudos tenencia $dom = HtmlDomParser::file_get_html(URL_TENENCIA . $placas); $tdAdeudos = $dom->find('#tabla500 td'); $aniosAdeudosTenencia = array(); foreach ($tdAdeudos as $tdAdeudo) { if (preg_match('/^\\d{4}\\b/', $tdAdeudo->plaintext)) { $aniosAdeudosTenencia[] = $tdAdeudo->plaintext; } } # Si no tiene adeudos de tenencia, busca info general # TODO: Refactor se repite abajo if (empty($aniosAdeudosTenencia)) { $curl = new Curl(); $curl->post(URL_CALCULO_TENENCIA, array('placa' => $placas, 'ejercicio' => 2015)); $jsonCalculoTenencia = json_decode($curl->response, true); $infoAuto['modelo'] = (int) $jsonCalculoTenencia['modelo']; if ($jsonCalculoTenencia['procedencia'] == 'N') {