Exemple #1
0
 /**
  * Crawl a news website and report how many entries the crawler collected.
  *
  * A new Crawler is built from the site URL and the time-to-live value, asked
  * to crawl, and the size of its getCrawled() result is returned.
  *
  * @param int    $nrOfDaysBack The nr of days the crawler should go back; forwarded as-is to Crawler::crawl()
  * @param string $newsSiteUrl  The root URL of the news site (the seed of the crawler)
  * @param mixed  $timeToLive   Forwarded to the Crawler constructor; unit and meaning are not visible here (TODO confirm)
  * @param mixed  $startDate    Optional. When truthy it is forwarded as the second argument of Crawler::crawl();
  *                             presumably the date to count back from instead of today (verify against Crawler)
  * @return int Number of elements in the crawler's getCrawled() result
  */
 public function crawlForNews($nrOfDaysBack, $newsSiteUrl, $timeToLive, $startDate = null)
 {
     $newsCrawler = new Crawler($newsSiteUrl, $timeToLive);

     // Without a usable start date, call crawl() with the day count only so
     // the crawler applies whatever default it defines for the second argument.
     if (!$startDate) {
         $newsCrawler->crawl($nrOfDaysBack);
     } else {
         $newsCrawler->crawl($nrOfDaysBack, $startDate);
     }

     $crawledItems = $newsCrawler->getCrawled();

     return count($crawledItems);
 }
 /**
  * Walk every pixel of the image, crawl an outline starting at each pixel that
  * is neither white nor already inside a collected outline, then slice the
  * image once per outline.
  *
  * @return array Two-element array: index 0 is the ImageCollection of slices,
  *               index 1 is the CrawlerOutlineCollection they were cut from
  */
 public function getHotSpots()
 {
     $tracer = new Crawler($this);
     $found = new CrawlerOutlineCollection();
     $dimensions = $this->image->size();

     // Outer loop runs over the first size entry (x), inner loop over the second (y).
     for ($col = 0; $col < $dimensions[0]; $col++) {
         for ($row = 0; $row < $dimensions[1]; $row++) {
             $current = $this->pixel($col, $row);

             // White pixels are skipped (the 5 passed to compare() is presumably
             // a tolerance — TODO confirm), and so are pixels that an earlier
             // outline already contains. The containment check only runs for
             // non-white pixels.
             $isWhite = $current->color()->compare(ImageColor::white(), 5);
             if ($isWhite || $found->contains($current)) {
                 continue;
             }

             // Start crawling from this pixel and remember the resulting outline.
             $found->push($tracer->crawl($col, $row));
         }
     }

     // Cut one slice out of the image for every outline collected above.
     $slices = new ImageCollection();
     foreach ($found as $shape) {
         $piece = $this->image->sliceByOutline($shape);
         $slices->push($piece);
     }

     return array($slices, $found);
 }
Exemple #3
0
 /**
  * Crawl the given URL, printing progress and recording image counts per hit.
  *
  * Three listeners are registered on a new Crawler before it is started:
  * one per hit (appends a row to $this->report and prints a line), one
  * before the crawl and one after it (each prints a marker line).
  *
  * @param mixed $url Passed unchanged to Crawler::crawl()
  * @throws Exception Declared by the original author; nothing in this method throws
  *                   directly, so it is presumably raised inside Crawler (TODO confirm)
  */
 public function crawl($url)
 {
     $engine = new Crawler();

     $engine->on(Crawler::EVENT_HIT_CRAWL, function ($href, DOMDocument $dom) {
         // Time only the <img> lookup on the document that was handed in.
         $startedAt = microtime(true);
         $imageCount = $dom->getElementsByTagName('img')->length;
         $elapsed = sprintf('%.6F', microtime(true) - $startedAt);

         $this->report[] = [
             'href' => $href,
             'imgLength' => $imageCount,
             'processTime' => $elapsed,
         ];

         $line = '  - ' . $href . ' [img: ' . $imageCount . ']' . PHP_EOL;
         $this->show($line);
     });

     $engine->on(Crawler::EVENT_BEFORE_CRAWL, function () {
         $this->show('Start crawl' . PHP_EOL);
     });

     $engine->on(Crawler::EVENT_AFTER_CRAWL, function () {
         $this->show('Finish crawl' . PHP_EOL);
     });

     $engine->crawl($url);
 }
Exemple #4
0
    define('DIR_ROOT', dirname(__FILE__));
}
// Locate the KvzLib checkout unless DIR_KVZLIB was already defined elsewhere.
// The first candidate directory that contains kvzlib.php wins.
if (!defined('DIR_KVZLIB')) {
    $lookIn = array('/Users/kevin/workspace/kvzlib', '/home/kevin/workspace/kvzlib', DIR_ROOT . '/ext/kvzlib');
    foreach ($lookIn as $dir) {
        if (is_dir($dir) && file_exists($dir . '/kvzlib.php')) {
            define('DIR_KVZLIB', $dir);
            break;
        }
    }
    if (!defined('DIR_KVZLIB')) {
        // E_USER_ERROR is fatal under PHP's default error handler, so the
        // require_once lines below are not reached without KvzLib.
        trigger_error('KvzLib not found in either: ' . implode(', ', $lookIn), E_USER_ERROR);
    }
}
define('IMDBPHP_CONFIG', DIR_ROOT . '/config/imdb.php');

// Prepend KvzLib and the project root to the include path. PATH_SEPARATOR is
// used instead of a literal ':' so the entries are joined with the separator
// of the running platform (':' on Unix-likes, ';' on Windows).
set_include_path(DIR_KVZLIB . PATH_SEPARATOR . DIR_ROOT . PATH_SEPARATOR . get_include_path());

require_once DIR_KVZLIB . '/php/classes/KvzShell.php';
require_once DIR_KVZLIB . '/php/classes/KvzHTML.php';
require_once DIR_KVZLIB . '/php/all_functions.php';
require_once DIR_ROOT . '/libs/crawler.php';
require_once DIR_ROOT . '/libs/movie.php';
require_once DIR_ROOT . '/libs/store.php';
// Resolved through the include path configured above.
require_once 'imdb.class.php';

// Destination of the generated page and of the 'photodir' images below.
$outDir = '/home/kevin/Dropbox/Public/cinema';
$outFile = 'kijken.html';

// Options handed to the Crawler; their exact meaning is defined in
// libs/crawler.php, which is not visible from here.
$crawlerOptions = array('dir' => '/data/moviesHD', 'minSize' => '600M', 'cachedir' => DIR_ROOT . '/cache', 'photodir' => $outDir . '/images');
$Crawler = new Crawler($crawlerOptions);
$movies = $Crawler->crawl();

// Hand the crawl result to the Store and save it using the 'html' format.
$Store = new Store($movies, 'html', array('photovirt' => 'images', 'outputdir' => $outDir, 'outputfile' => $outFile, 'separate_on_dir' => 1));
$Store->save();
// Disabled call kept from the original script:
#$Store->output();
Exemple #5
0
 /**
  * Build a Crawler from this object's configuration and return whatever its
  * crawl() call produces for the given URL.
  *
  * @param string      $url     Forwarded as the first argument of Crawler::crawl()
  * @param string|null $rawHTML Optional; forwarded as the second argument of Crawler::crawl().
  *                             Presumably pre-fetched markup for $url — verify against Crawler
  * @return mixed The value returned by Crawler::crawl()
  */
 public function extractContent($url, $rawHTML = null)
 {
     $contentCrawler = new Crawler($this->config);

     return $contentCrawler->crawl($url, $rawHTML);
 }
Exemple #6
0
 /**
  * Generate sitemap
  *
  * Runs the crawler, then appends one entry (loc, priority, changefreq) to
  * $this->urls for every found URL whose status loosely equals
  * Crawler::STATUS_SUCCESS. URLs with any other status are ignored.
  *
  * @return $this
  */
 public function generate()
 {
     $this->crawler->crawl();

     $foundUrls = $this->crawler->getFoundUrls();
     foreach ($foundUrls as $location => $crawlStatus) {
         // Only successfully crawled URLs go into the sitemap.
         if ($crawlStatus != \Magelight\Sitemap\Models\Crawler::STATUS_SUCCESS) {
             continue;
         }

         $entry = [
             'loc' => $location,
             'priority' => $this->getUrlPriority($location),
             'changefreq' => $this->getUrlChangeFrequency($location),
         ];
         $this->urls[] = $entry;
     }

     return $this;
 }