Uses a cURL interface implementation to attempt to load
the content from a URL.
public loadFromUrl ( string $url, array $options = [], PHPHtmlParser\CurlInterface $curl = null )
| Parameter | Type |
| $url | string |
| $options | array |
| $curl | PHPHtmlParser\CurlInterface |
/**
 * Walks the listing at $url, registers every acceptable object found on it
 * and moves on to the next page until enough objects have been collected.
 *
 * A timed-out request is repeated for the same URL; any other curl failure,
 * as well as a page reporting an IP block, ends in a CurlException.
 *
 * @inheritdoc
 */
protected function collectObjects($url)
{
    if (!array_key_exists($url, $this->collectedCount)) {
        $this->collectedCount[$url] = 0;
    }

    $dom = new Dom();
    try {
        $dom->loadFromUrl($url, [], GlabsController::$curl);
    } catch (CurlException $loadError) {
        $timedOut = strpos($loadError->getMessage(), 'timed out') !== false;
        if (!$timedOut) {
            throw new CurlException($loadError->getMessage());
        }

        GlabsController::showMessage(' ...trying again', false);

        return $this->collectObjects($url);
    }

    if (strpos($dom, 'This IP has been automatically blocked.') !== false) {
        throw new CurlException('IP has been blocked.');
    }

    // A "#moon" element marks a page without results: nothing more to collect.
    if ($dom->find('#moon')[0]) {
        return true;
    }

    $this->checkTotalObjects($dom);

    // Base used by checkObjectLink() when it inspects each href.
    $host = 'http://' . parse_url($url, PHP_URL_HOST);

    /* @var \PHPHtmlParser\Dom\AbstractNode $row */
    foreach ($dom->find('.txt') as $row) {
        if ($this->isEnoughCollect()) {
            break;
        }

        /* @var \PHPHtmlParser\Dom\AbstractNode $anchor */
        $anchor = $row->find('a')[0];
        if (!$anchor) {
            continue;
        }

        $href = $this->checkObjectLink($host, $anchor->getAttribute('href'));
        if ($href === false) {
            continue;
        }

        // Fall back to the tag-stripped inner HTML when the anchor has no text.
        $title = $anchor->text();
        if (!$title) {
            $title = strip_tags($anchor->innerHtml());
        }

        try {
            $object = $this->getObjectModel($url, $href, $title, $this->categoryId, $this->type);
            $object->setPrice($row);
        } catch (ObjectException $rejected) {
            continue;
        }

        $this->collected[] = $href;
        $this->objects[] = $object;
        $this->collectedCount[$url]++;
        BaseSite::$doneObjects++;
        BaseSite::progress();
    }

    if ($this->isEnoughCollect()) {
        return true;
    }

    // Request the next page, presenting the current one as the referer.
    $client = GlabsController::$curl;
    $client::$referer = $url;

    $baseUrl = str_replace([self::$pageParam . self::$page, '#list'], '', $url);
    self::$page += 100;

    return $this->collectObjects($this->getPagedUrl($baseUrl));
}
/**
 * Collects objects from the listing page at $url and recurses into the next
 * page until isEnoughCollect() reports that enough were gathered.
 *
 * A request is repeated when the curl error message mentions either
 * 'timed out' or '525'; every other curl failure is re-thrown.
 *
 * @inheritdoc
 *
 * @throws CurlException when the page cannot be loaded for a non-retryable reason
 */
protected function collectObjects($url)
{
    if (!array_key_exists($url, $this->collectedCount)) {
        $this->collectedCount[$url] = 0;
    }

    $dom = new Dom();
    try {
        $dom->loadFromUrl($url, [], GlabsController::$curl);
    } catch (CurlException $e) {
        $message = $e->getMessage();

        // Either marker alone makes the failure retryable. The previous pair of
        // consecutive guards only retried when BOTH markers were present, so a
        // plain timeout was always re-thrown.
        $retryable = false !== strpos($message, 'timed out')
            || false !== strpos($message, '525');
        if (!$retryable) {
            throw new CurlException($message);
        }

        GlabsController::showMessage(' ...trying again', false);

        return $this->collectObjects($url);
    }

    // end collect. no results
    if (false !== strpos($dom, 'No matches found.')) {
        return true;
    }

    $this->checkTotalObjects($dom);

    /* @var \PHPHtmlParser\Dom\AbstractNode $span */
    foreach ($dom->find('.summaryHeader') as $span) {
        if ($this->isEnoughCollect()) {
            break;
        }

        /* @var \PHPHtmlParser\Dom\AbstractNode $link */
        $link = $span->find('a', 0);
        if (!$link) {
            continue;
        }

        $href = $link->getAttribute('href');
        if (in_array($href, $this->collected, true)) {
            // Already registered from an earlier row or page.
            continue;
        }

        $object = new Object($url, $href, $link->text(), $this->categoryId, $this->type);
        try {
            $object->setPrice();
        } catch (ObjectException $e) {
            // Objects whose price cannot be set are skipped.
            continue;
        }

        $this->collected[] = $href;
        $this->objects[] = $object;
        $this->collectedCount[$url]++;
        BaseSite::$doneObjects++;
        BaseSite::progress();
    }

    if (!$this->isEnoughCollect()) {
        // Present the current page as referer for the next request.
        $curl = GlabsController::$curl;
        $curl::$referer = $url;

        $url = str_replace(self::$pageParam . self::$page, '', $url);
        // The first advance adds 2 (page counter still 0), later ones add 1.
        self::$page += self::$page ? 1 : 2;

        return $this->collectObjects($this->getPagedUrl($url));
    }

    return true;
}
/**
 * Creates a new dom object and calls loadFromUrl() on the
 * new object.
 *
 * The new object is also stored in self::$dom. When no curl implementation
 * is supplied, a default Curl instance is created.
 *
 * @param string $url
 * @param CurlInterface $curl optional transport; defaults to a new Curl
 * @return $this the value returned by the instance loadFromUrl() call
 */
public static function loadFromUrl($url, CurlInterface $curl = null)
{
    $dom = new Dom();
    self::$dom = $dom;

    if (is_null($curl)) {
        // use the default curl interface
        $curl = new Curl();
    }

    // The instance method is documented as ($url, $options, $curl): pass an
    // empty options array so that $curl is not consumed as $options.
    return $dom->loadFromUrl($url, [], $curl);
}
/**
 * Collects objects from the listing page at $url and recurses into the next
 * page until isEnoughCollect() reports that enough were gathered.
 *
 * A request is repeated when the curl error message mentions either
 * 'timed out' or '525'; every other curl failure is re-thrown. Collection
 * stops as soon as the page contains one of the localized "no results"
 * phrases.
 *
 * @inheritdoc
 *
 * @throws CurlException when the page cannot be loaded for a non-retryable reason
 */
protected function collectObjects($url)
{
    if (!array_key_exists($url, $this->collectedCount)) {
        $this->collectedCount[$url] = 0;
    }

    $dom = new Dom();
    try {
        $dom->loadFromUrl($url, [], GlabsController::$curl);
    } catch (CurlException $e) {
        $message = $e->getMessage();

        // Either marker alone makes the failure retryable. The previous pair of
        // consecutive guards only retried when BOTH markers were present, so a
        // plain timeout was always re-thrown.
        $retryable = false !== strpos($message, 'timed out')
            || false !== strpos($message, '525');
        if (!$retryable) {
            throw new CurlException($message);
        }

        GlabsController::showMessage(' ...trying again', false);

        return $this->collectObjects($url);
    }

    // end collect. no results
    $noResultPhrases = [
        'No matches found',
        'Keine Entsprechungen gefunden',
        'No hay resultados',
        'Nessuna corrispondenza trovata',
        'Aucune correspondance n’a été trouvée',
        'Nenhuma correspondência encontrada',
        'Совпадений нет',
        'Ingen match fundet',
        'Nebyly nalezeny žádné shody',
        'Ingen match funnet',
        'Nie znaleziono',
        'Eşleşme bulunamadı',
        'Eredmény nem található',
        'Δεν βρέθηκαν εγγραφές',
    ];
    // Render the document once instead of once per phrase.
    $markup = (string) $dom;
    foreach ($noResultPhrases as $phrase) {
        if (false !== strpos($markup, $phrase)) {
            return true;
        }
    }

    $this->checkTotalObjects($dom);

    /* @var \PHPHtmlParser\Dom\AbstractNode $span */
    foreach ($dom->find('.cat') as $span) {
        if ($this->isEnoughCollect()) {
            break;
        }

        /* @var \PHPHtmlParser\Dom\AbstractNode $link */
        $link = $span->find('a', 0);
        if (!$link) {
            continue;
        }

        $href = $link->getAttribute('href');
        if (in_array($href, $this->collected, true)) {
            // Already registered from an earlier row or page.
            continue;
        }

        try {
            $object = new Object($url, $href, $link->text(), $this->categoryId, $this->type);
        } catch (ObjectException $e) {
            // Objects that cannot be constructed are skipped.
            continue;
        }

        $this->collected[] = $href;
        $this->objects[] = $object;
        $this->collectedCount[$url]++;
        BaseSite::$doneObjects++;
        BaseSite::progress();
    }

    if (!$this->isEnoughCollect()) {
        // Present the current page as referer for the next request.
        $curl = GlabsController::$curl;
        $curl::$referer = $url;

        $url = str_replace(self::$pageParam . self::$page, '', $url);
        // The first advance adds 2 (page counter still 0), later ones add 1.
        self::$page += self::$page ? 1 : 2;

        return $this->collectObjects($this->getPagedUrl($url));
    }

    return true;
}
/**
 * Fetches $this->url into the shared DOM, sending the category URL as referer.
 *
 * A request whose curl error mentions 'timed out' is repeated; every other
 * curl error and an EmptyCollectionException are converted to ObjectException.
 *
 * @return bool always true once the page has been loaded
 *
 * @throws ObjectException
 */
protected function loadDom()
{
    try {
        $client = GlabsController::$curl;
        $client::$referer = $this->categoryUrl;

        self::$dom->loadFromUrl($this->url, [], $client);
    } catch (CurlException $curlError) {
        $reason = $curlError->getMessage();
        if (strpos($reason, 'timed out') === false) {
            throw new ObjectException($reason);
        }

        GlabsController::showMessage(' ...trying again', false);

        return $this->loadDom();
    } catch (EmptyCollectionException $emptyError) {
        throw new ObjectException($emptyError->getMessage());
    }

    return true;
}
/**
 * loadFromUrl() must fetch the page through the injected curl object and
 * parse the returned markup.
 */
public function testLoadFromUrl()
{
    $address = 'http://google.com';
    $markup = file_get_contents('tests/files/small.html');

    // The mocked transport expects exactly one get() for the address.
    $transport = Mockery::mock('PHPHtmlParser\\CurlInterface');
    $transport->shouldReceive('get')
        ->once()
        ->with($address)
        ->andReturn($markup);

    $document = new Dom();
    $document->loadFromUrl($address, [], $transport);

    $author = $document->find('.post-row div .post-user font', 0);
    $this->assertEquals('VonBurgermeister', $author->text);
}
<?php /* * YLE Areena video crawler * Find video links from a episode listing page and download them using yle-dl. * http://aajanki.github.io/yle-dl/ * Run as a cron job to download episodes. * 0 1 * * * php -f /home/john/.cronscripts/yle_episode_downloader.php >> /home/john/.cronscripts/download.log * */ require __DIR__ . '/vendor/autoload.php'; use PHPHtmlParser\Dom; $page_URL = "http://areena.yle.fi/1-2540138"; $saved_videos_folder = "/home/john/Videos/YLE/Yle_uutiset/"; $dom = new Dom(); $dom->loadFromUrl($page_URL); // Find the dom element with videos and loop them through $newslist = $dom->find('ul.program-list li'); if (count($newslist) >= 1) { foreach ($newslist as $news) { // Get video ID $data_item_id = $news->getAttribute('data-item-id'); /* <time itemprop="startDate" datetime="2015-07-10T20:30:00.000+03:00"> */ $timestamp_dom = $news->find('time[itemprop=startDate]'); $timestamp = $timestamp_dom->getAttribute('datetime'); $weekday = "_" . date('l', strtotime($timestamp)); $pubDate = strftime("%Y-%m-%d_klo_%H.%M", strtotime($timestamp)); $filename = "Yle_uutiset_" . $pubDate . $weekday . ".flv"; $url = "http://areena.yle.fi/" . $data_item_id; if (!file_exists($saved_videos_folder . $filename)) { echo "\nDownloading video: " . $filename . "\n";
/**
 * Imports a specific news entry
 *
 * The entry is skipped when a `news` row with the same uid already exists.
 * Otherwise the article page is downloaded, its title and first paragraph
 * are extracted and a News record is saved.
 *
 * @param PHPHtmlParser\Dom $html   node whose `href` attribute points at the article
 * @param string            $origin publication date text, parsed with strtotime()
 * @return boolean|null result of News::save(), or null when already imported
 */
private function importNewsEntry($html, $origin)
{
    $uri = $html->getAttribute('href');
    $uid = str_replace('/galnet/uid/', '', $uri);

    $count = (new \yii\db\Query())->from('news')->where(['uid' => $uid])->count();
    if ((int) $count !== 0) {
        $this->stdOut(" - {$uid} :: Already Imported...\n");
        return null;
    }

    $dom = new Dom();
    $dom->loadFromUrl(Yii::$app->params['galnet']['url'] . $uri);

    // Either node may be absent; treat that as empty markup instead of
    // reading a property on null.
    $titleNode = $dom->find('h3.galnetNewsArticleTitle a')[0];
    $bodyNode = $dom->find('div.article p')[0];
    $titleHtml = $titleNode ? $titleNode->innerHtml : '';
    $bodyHtml = $bodyNode ? $bodyNode->innerHtml : '';

    $title = trim(strip_tags($titleHtml));
    $content = trim(strip_tags(str_replace('<br /><br /> ', "\n", $bodyHtml)));

    // Early Galnet posts are empty, so grab the first line from the article
    if (empty($title)) {
        $firstLine = strtok($content, "\n");
        // strtok() yields false for empty content; keep the title a string.
        $title = false === $firstLine ? '' : $firstLine;
    }

    // One timestamp for both columns so they cannot differ.
    $now = time();

    $news = new News();
    $news->attributes = [
        'uid' => $uid,
        'title' => $title,
        'content' => $content,
        'created_at' => $now,
        'updated_at' => $now,
        'published_at_native' => strtotime($origin),
        'published_at' => strtotime($origin . "-1286 years"),
    ];

    $this->stdOut(" - {$uid}\n");

    return $news->save();
}
<?php

/*
 * Downloads the first two result pages of the flats listing and copies every
 * cell of `table.html_table_1` into a single HTML table in index.html.
 */

include "vendor/autoload.php";

use PHPHtmlParser\Dom;

$outputFile = "index.html";

$head = "<!DOCTYPE html>\n <html><head>\n<meta charset='utf-8'>\n</head><body><table>";
file_put_contents($outputFile, $head);

for ($i = 1; $i <= 2; $i++) {
    $url = "http://www.emls.ru/flats/page{$i}.html?query=s/1/place/address/reg/2/dept/2/sort1/1/dir1/2/sort2/3/dir2/1/interval/3";

    $dom = new Dom();
    $dom->loadFromUrl($url);

    // Build the rows of one page in memory and append them with a single
    // write instead of reopening the file for every tag and cell.
    $rows = "";
    foreach ($dom->find("table.html_table_1 tr") as $tr) {
        $rows .= "<tr>";
        foreach ($tr->find("td") as $td) {
            // Scraped text goes into HTML, so escape it; double_encode is off
            // so entities already present in the text are left untouched.
            $cell = htmlspecialchars($td->text(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
            $rows .= "<td>" . $cell . "</td>";
        }
        $rows .= "</tr>";
    }
    file_put_contents($outputFile, $rows, FILE_APPEND);
}

$footer = "</table></body></html>";
file_put_contents($outputFile, $footer, FILE_APPEND);
<?php

/*
 * Renders an RSS feed for a Facebook page.
 *
 * The page id is taken from the first command-line argument; the page is
 * loaded through FbPageToRSS\Curl and the rendered feed is echoed.
 */

require_once 'vendor/autoload.php';

use PHPHtmlParser\Dom;

// Read the argument by value: a reference to a missing $argv[1] would
// silently create that element.
$pageId = isset($argv[1]) ? $argv[1] : '';

if ('' === $pageId) {
    // Usage errors go to STDERR with a non-zero status so callers can detect them.
    fwrite(STDERR, 'Usage `php render.php GitHub` to generate rss feed for `https://www.facebook.com/GitHub`' . PHP_EOL);
    exit(1);
}

$url = sprintf('https://www.facebook.com/%s?_fb_noscript=1', urlencode($pageId));

$dom = new Dom();
$dom->loadFromUrl($url, ['whitespaceTextNode' => false], new \FbPageToRSS\Curl());

$parser = new \FbPageToRSS\FbParser($dom);
$feedGenerator = new \FbPageToRSS\FeedGenerator($parser);
$feedGenerator->setSkipFixedPost(true);

echo $feedGenerator->render();
// Load and parse Sainsbury's Ripe Fruits webpage $dom->loadFromUrl(URL, [], new \Curl()); // In order to find all the urls to the product pages, we need to // find all the divs with the .product class $products = $dom->find('#productsContainer .product'); echo "Total products: " . count($products) . PHP_EOL; $product_response = new ProductResponse(); foreach ($products as $p) { // Get the URL to the products page $href = $p->find('.productInfo a')->getAttribute('href'); echo 'Get content for: ' . $href . PHP_EOL; // Instantiate new DOM for product page. $pp_dom = new Dom(); // Load and parse product page HTML $product_page_curl = new \Curl(); $pp_dom->loadFromUrl($href, [], $product_page_curl); // Instantiate new Product $product = new Product(); // Find and assign title to the Product $product->title = $pp_dom->find('.productSummary h1')->text(); // Assign page size to the Product $product->size = $product_page_curl->getSize('kb'); // Find unit_price then, remove £ and any whitespace // And assign it to the Product $product->unit_price = preg_replace('/£|\\s/', '', $pp_dom->find('.productSummary p.pricePerUnit')->text()); // First we try extracting description from the div.productText next to // the h3.productDataItemHeader (with text = "Description"). // If this fails (on some product pages) we get description // from the div.longTextItems within // div.itemTypeGroupContainer $div_description = $pp_dom->find('.productDataItemHeader');