loadFromUrl() public method

Uses a cURL interface implementation to attempt to load the content from a URL.
public loadFromUrl ( string $url, array $options = [], phphtmlparser\CurlInterface $curl = null )
$url string
$options array
$curl phphtmlparser\CurlInterface
Exemplo n.º 1
0
 /**
  * Loads the listing page at $url, turns every usable link found in the
  * ".txt" elements into an object, and recurses into the next page until
  * enough objects have been collected.
  *
  * @inheritdoc
  */
 protected function collectObjects($url)
 {
     // Start the per-URL counter the first time this URL is seen.
     if (false === array_key_exists($url, $this->collectedCount)) {
         $this->collectedCount[$url] = 0;
     }

     $baseUrl = 'http://' . parse_url($url, PHP_URL_HOST);
     $document = new Dom();
     try {
         $document->loadFromUrl($url, [], GlabsController::$curl);
     } catch (CurlException $error) {
         $message = $error->getMessage();
         if (false !== strpos($message, 'timed out')) {
             // A timeout is retried with the very same URL.
             GlabsController::showMessage(' ...trying again', false);

             return $this->collectObjects($url);
         }

         throw new CurlException($message);
     }

     if (false !== strpos($document, 'This IP has been automatically blocked.')) {
         throw new CurlException('IP has been blocked.');
     }

     // An element with id "moon" marks the end of the collection (no results).
     if ($document->find('#moon')[0]) {
         return true;
     }

     $this->checkTotalObjects($document);

     /* @var \PHPHtmlParser\Dom\AbstractNode $entry */
     foreach ($document->find('.txt') as $entry) {
         if ($this->isEnoughCollect()) {
             break;
         }

         /* @var \PHPHtmlParser\Dom\AbstractNode $anchor */
         $anchor = $entry->find('a')[0];
         if (!$anchor) {
             continue;
         }

         $href = $this->checkObjectLink($baseUrl, $anchor->getAttribute('href'));
         if (false === $href) {
             continue;
         }

         // Fall back to the tag-stripped inner HTML when the link has no direct text.
         $title = $anchor->text() ?: strip_tags($anchor->innerHtml());
         try {
             $object = $this->getObjectModel($url, $href, $title, $this->categoryId, $this->type);
             $object->setPrice($entry);
         } catch (ObjectException $skipped) {
             // Entries that cannot be turned into a priced object are skipped.
             continue;
         }

         $this->collected[] = $href;
         $this->objects[] = $object;
         $this->collectedCount[$url]++;
         BaseSite::$doneObjects++;
         BaseSite::progress();
     }

     if ($this->isEnoughCollect()) {
         return true;
     }

     // Not enough yet: use the current page as referer and move on to the next page.
     $client = GlabsController::$curl;
     $client::$referer = $url;
     $url = str_replace([self::$pageParam . self::$page, '#list'], '', $url);
     self::$page += 100;

     return $this->collectObjects($this->getPagedUrl($url));
 }
Exemplo n.º 2
0
 /**
  * Loads the listing page at $url, turns the first link of every
  * ".summaryHeader" element into an object, and recurses into the next
  * page until enough objects have been collected.
  *
  * @inheritdoc
  *
  * @return bool true once collecting has finished
  *
  * @throws CurlException when loading fails for a reason other than a timeout or a "525" error
  */
 protected function collectObjects($url)
 {
     if (!array_key_exists($url, $this->collectedCount)) {
         $this->collectedCount[$url] = 0;
     }
     $dom = new Dom();
     try {
         $dom->loadFromUrl($url, [], GlabsController::$curl);
     } catch (CurlException $e) {
         // A timeout and a "525" error are both treated as transient: either
         // marker on its own is enough to retry the same URL.
         $message = $e->getMessage();
         $isTransient = false !== strpos($message, 'timed out')
             || false !== strpos($message, '525');
         if (!$isTransient) {
             // Rethrow the original exception so its code and trace are kept.
             throw $e;
         }
         GlabsController::showMessage(' ...trying again', false);
         return $this->collectObjects($url);
     }
     // end collect. no results
     if (false !== strpos($dom, 'No matches found.')) {
         return true;
     }
     $this->checkTotalObjects($dom);
     /* @var \PHPHtmlParser\Dom\AbstractNode $span */
     foreach ($dom->find('.summaryHeader') as $span) {
         if ($this->isEnoughCollect()) {
             break;
         }
         /* @var \PHPHtmlParser\Dom\AbstractNode $link */
         if ($link = $span->find('a', 0)) {
             $href = $link->getAttribute('href');
             // Skip links that were already collected.
             if (in_array($href, $this->collected, true)) {
                 continue;
             }
             $object = new Object($url, $href, $link->text(), $this->categoryId, $this->type);
             try {
                 $object->setPrice();
             } catch (ObjectException $e) {
                 // Objects whose price cannot be set are skipped.
                 continue;
             }
             $this->collected[] = $href;
             $this->objects[] = $object;
             $this->collectedCount[$url]++;
             BaseSite::$doneObjects++;
             BaseSite::progress();
         }
     }
     if (!$this->isEnoughCollect()) {
         // Use the current page as referer, then advance the page counter
         // (by 2 while it is still 0, by 1 afterwards).
         $curl = GlabsController::$curl;
         $curl::$referer = $url;
         $url = str_replace(self::$pageParam . self::$page, '', $url);
         self::$page += self::$page ? 1 : 2;
         return $this->collectObjects($this->getPagedUrl($url));
     }
     return true;
 }
Exemplo n.º 3
0
 /**
  * Builds a fresh Dom, stores it as the shared static instance and
  * delegates to that instance's loadFromUrl().
  *
  * NOTE(review): the curl object is passed as the second argument; confirm
  * this matches the instance method's signature in the library version in use.
  *
  * @param string $url
  * @param CurlInterface $curl optional; a default Curl is created when null
  * @return mixed whatever the instance-level loadFromUrl() call returns
  */
 public static function loadFromUrl($url, CurlInterface $curl = null)
 {
     $instance = new Dom();
     self::$dom = $instance;

     // Fall back to the default curl interface when the caller gave none.
     $transport = null === $curl ? new Curl() : $curl;

     return $instance->loadFromUrl($url, $transport);
 }
Exemplo n.º 4
0
 /**
  * Loads the listing page at $url, turns the first link of every ".cat"
  * element into an object, and recurses into the next page until enough
  * objects have been collected.
  *
  * @inheritdoc
  *
  * @return bool true once collecting has finished
  *
  * @throws CurlException when loading fails for a reason other than a timeout or a "525" error
  */
 protected function collectObjects($url)
 {
     if (!array_key_exists($url, $this->collectedCount)) {
         $this->collectedCount[$url] = 0;
     }
     $dom = new Dom();
     try {
         $dom->loadFromUrl($url, [], GlabsController::$curl);
     } catch (CurlException $e) {
         // A timeout and a "525" error are both treated as transient: either
         // marker on its own is enough to retry the same URL.
         $message = $e->getMessage();
         $isTransient = false !== strpos($message, 'timed out')
             || false !== strpos($message, '525');
         if (!$isTransient) {
             // Rethrow the original exception so its code and trace are kept.
             throw $e;
         }
         GlabsController::showMessage(' ...trying again', false);
         return $this->collectObjects($url);
     }
     // end collect. no results
     // The "no results" message in every language the page is checked for.
     $noResultMarkers = [
         'No matches found',
         'Keine Entsprechungen gefunden',
         'No hay resultados',
         'Nessuna corrispondenza trovata',
         'Aucune correspondance n’a été trouvée',
         'Nenhuma correspondência encontrada',
         'Совпадений нет',
         'Ingen match fundet',
         'Nebyly nalezeny žádné shody',
         'Ingen match funnet',
         'Nie znaleziono',
         'Eşleşme bulunamadı',
         'Eredmény nem található',
         'Δεν βρέθηκαν εγγραφές',
     ];
     // Render the document to a string once instead of once per marker.
     $html = (string) $dom;
     foreach ($noResultMarkers as $marker) {
         if (false !== strpos($html, $marker)) {
             return true;
         }
     }
     $this->checkTotalObjects($dom);
     /* @var \PHPHtmlParser\Dom\AbstractNode $span */
     foreach ($dom->find('.cat') as $span) {
         if ($this->isEnoughCollect()) {
             break;
         }
         /* @var \PHPHtmlParser\Dom\AbstractNode $link */
         if ($link = $span->find('a', 0)) {
             $href = $link->getAttribute('href');
             // Skip links that were already collected.
             if (in_array($href, $this->collected, true)) {
                 continue;
             }
             try {
                 $object = new Object($url, $href, $link->text(), $this->categoryId, $this->type);
             } catch (ObjectException $e) {
                 // Links that cannot be turned into an object are skipped.
                 continue;
             }
             $this->collected[] = $href;
             $this->objects[] = $object;
             $this->collectedCount[$url]++;
             BaseSite::$doneObjects++;
             BaseSite::progress();
         }
     }
     if (!$this->isEnoughCollect()) {
         // Use the current page as referer, then advance the page counter
         // (by 2 while it is still 0, by 1 afterwards).
         $curl = GlabsController::$curl;
         $curl::$referer = $url;
         $url = str_replace(self::$pageParam . self::$page, '', $url);
         self::$page += self::$page ? 1 : 2;
         return $this->collectObjects($this->getPagedUrl($url));
     }
     return true;
 }
Exemplo n.º 5
0
 /**
  * Loads this object's URL into the shared DOM, sending the category URL
  * as referer. A timed-out request is retried; any other curl failure or
  * an empty collection is reported as an ObjectException.
  *
  * @return bool true once the page has been loaded
  *
  * @throws ObjectException
  */
 protected function loadDom()
 {
     $client = GlabsController::$curl;
     $client::$referer = $this->categoryUrl;

     try {
         self::$dom->loadFromUrl($this->url, [], $client);
     } catch (CurlException $curlError) {
         $reason = $curlError->getMessage();
         if (false === strpos($reason, 'timed out')) {
             throw new ObjectException($reason);
         }

         // Timed out: announce the retry and start over.
         GlabsController::showMessage(' ...trying again', false);

         return $this->loadDom();
     } catch (EmptyCollectionException $emptyError) {
         throw new ObjectException($emptyError->getMessage());
     }

     return true;
 }
Exemplo n.º 6
0
 /**
  * loadFromUrl() must fetch the page through the injected curl interface
  * (exactly one get() call for the URL) and parse the returned markup.
  */
 public function testLoadFromUrl()
 {
     $targetUrl = 'http://google.com';

     // The mocked transport serves a local fixture instead of hitting the network.
     $transport = Mockery::mock('PHPHtmlParser\\CurlInterface');
     $transport
         ->shouldReceive('get')
         ->once()
         ->with($targetUrl)
         ->andReturn(file_get_contents('tests/files/small.html'));

     $parsed = new Dom();
     $parsed->loadFromUrl($targetUrl, [], $transport);

     $userNode = $parsed->find('.post-row div .post-user font', 0);
     $this->assertEquals('VonBurgermeister', $userNode->text);
 }
<?php

/*
 * YLE Areena video crawler
 * Find video links from a episode listing page and download them using yle-dl.
 * http://aajanki.github.io/yle-dl/
 * Run as a cron job to download episodes.
 * 0 1 * * * php -f /home/john/.cronscripts/yle_episode_downloader.php >> /home/john/.cronscripts/download.log
 *
 */
require __DIR__ . '/vendor/autoload.php';
use PHPHtmlParser\Dom;
$page_URL = "http://areena.yle.fi/1-2540138";
$saved_videos_folder = "/home/john/Videos/YLE/Yle_uutiset/";
$dom = new Dom();
$dom->loadFromUrl($page_URL);
// Find the dom element with videos and loop them through
$newslist = $dom->find('ul.program-list li');
if (count($newslist) >= 1) {
    foreach ($newslist as $news) {
        // Get video ID
        $data_item_id = $news->getAttribute('data-item-id');
        /* <time itemprop="startDate" datetime="2015-07-10T20:30:00.000+03:00"> */
        $timestamp_dom = $news->find('time[itemprop=startDate]');
        $timestamp = $timestamp_dom->getAttribute('datetime');
        $weekday = "_" . date('l', strtotime($timestamp));
        $pubDate = strftime("%Y-%m-%d_klo_%H.%M", strtotime($timestamp));
        $filename = "Yle_uutiset_" . $pubDate . $weekday . ".flv";
        $url = "http://areena.yle.fi/" . $data_item_id;
        if (!file_exists($saved_videos_folder . $filename)) {
            echo "\nDownloading video: " . $filename . "\n";
 /**
  * Imports a single news entry unless a row with the same uid already
  * exists in the "news" table.
  *
  * @param mixed $html link node of the entry; only getAttribute('href') is
  *                    used here (previously annotated as PHPHtmlParser\Dom — TODO confirm)
  * @param string $origin publication date string, parsed with strtotime()
  * @return null nothing is returned on either path
  */
 private function importNewsEntry($html, $origin)
 {
     $uri = $html->getAttribute('href');
     $uid = str_replace('/galnet/uid/', '', $uri);

     $existing = (int) (new \yii\db\Query())->from('news')->where(['uid' => $uid])->count();
     if (0 !== $existing) {
         $this->stdOut("    - {$uid} :: Already Imported...\n");

         return;
     }

     $article = new Dom();
     $article->loadFromUrl(Yii::$app->params['galnet']['url'] . $uri);

     $titleHtml = $article->find('h3.galnetNewsArticleTitle a')[0]->innerHtml;
     $title = trim(strip_tags($titleHtml));

     // Double line breaks in the article body become newlines before the tags are stripped.
     $bodyHtml = $article->find('div.article p')[0]->innerHtml;
     $content = trim(strip_tags(str_replace('<br /><br /> ', "\n", $bodyHtml)));

     // Early Galnet posts are empty, so grab the first line from the article
     if (empty($title)) {
         $title = strtok($content, "\n");
     }

     $news = new News();
     $news->attributes = [
         'uid' => $uid,
         'title' => $title,
         'content' => $content,
         'created_at' => time(),
         'updated_at' => time(),
         'published_at_native' => strtotime($origin),
         'published_at' => strtotime($origin . "-1286 years"),
     ];

     $this->stdOut("    - {$uid}\n");
     $news->save();
 }
Exemplo n.º 9
0
<?php

include "vendor/autoload.php";
use PHPHtmlParser\Dom;
// Dump the cells of the flat-listing tables on pages 1 and 2 into a local
// index.html, appending one fragment at a time.
$outputFile = "index.html";

// Writing the header without FILE_APPEND starts the file afresh.
$pageHeader = "<!DOCTYPE html>\n <html><head>\n<meta charset='utf-8'>\n</head><body><table>";
file_put_contents($outputFile, $pageHeader);

foreach (range(1, 2) as $pageNumber) {
    $listingUrl = "http://www.emls.ru/flats/page{$pageNumber}.html?query=s/1/place/address/reg/2/dept/2/sort1/1/dir1/2/sort2/3/dir2/1/interval/3";
    $listing = new Dom();
    $listing->loadFromUrl($listingUrl);

    // Every row of the listing table becomes a <tr>, every cell's text a <td>.
    foreach ($listing->find("table.html_table_1 tr") as $row) {
        file_put_contents($outputFile, "<tr>", FILE_APPEND);
        foreach ($row->find("td") as $cell) {
            file_put_contents($outputFile, "<td>" . $cell->text() . "</td>", FILE_APPEND);
        }
        file_put_contents($outputFile, "</tr>", FILE_APPEND);
    }
}

$pageFooter = "</table></body></html>";
file_put_contents($outputFile, $pageFooter, FILE_APPEND);
Exemplo n.º 10
0
<?php

require_once 'vendor/autoload.php';
use PHPHtmlParser\Dom;
// The Facebook page name is taken from the first command-line argument.
$pageId = isset($argv[1]) ? $argv[1] : null;
if (empty($pageId)) {
    exit('Usage `php render.php GitHub` to generate rss feed for `https://www.facebook.com/GitHub`');
}

// Load the no-script variant of the page through the project's own curl class.
$dom = new Dom();
$dom->loadFromUrl(
    sprintf('https://www.facebook.com/%s?_fb_noscript=1', urlencode($pageId)),
    ['whitespaceTextNode' => false],
    new \FbPageToRSS\Curl()
);

// Parse the loaded page and print the rendered feed.
$feedGenerator = new \FbPageToRSS\FeedGenerator(new \FbPageToRSS\FbParser($dom));
$feedGenerator->setSkipFixedPost(true);
echo $feedGenerator->render();
Exemplo n.º 11
0
// Load and parse Sainsbury's Ripe Fruits webpage
$dom->loadFromUrl(URL, [], new \Curl());
// In order to find all the urls to the product pages, we need to
// find all the divs with the .product class
$products = $dom->find('#productsContainer .product');
echo "Total products: " . count($products) . PHP_EOL;
$product_response = new ProductResponse();
foreach ($products as $p) {
    // Get the URL to the products page
    $href = $p->find('.productInfo a')->getAttribute('href');
    echo 'Get content for: ' . $href . PHP_EOL;
    // Instantiate new DOM for product page.
    $pp_dom = new Dom();
    // Load and parse product page HTML
    $product_page_curl = new \Curl();
    $pp_dom->loadFromUrl($href, [], $product_page_curl);
    // Instantiate new Product
    $product = new Product();
    // Find and assign title to the Product
    $product->title = $pp_dom->find('.productSummary h1')->text();
    // Assign page size to the Product
    $product->size = $product_page_curl->getSize('kb');
    // Find unit_price then, remove £ and any whitespace
    // And assign it to the Product
    $product->unit_price = preg_replace('/£|\\s/', '', $pp_dom->find('.productSummary p.pricePerUnit')->text());
    // First we try extracting description from the div.productText next to
    // the h3.productDataItemHeader (with text = "Description").
    // If this fails (on some product pages) we get description
    // from the div.longTextItems within
    // div.itemTypeGroupContainer
    $div_description = $pp_dom->find('.productDataItemHeader');