/**
 * Run a crawler against a news website and report how many pages it crawled.
 *
 * @param mixed $nrOfDaysBack The number of days the crawler should go back (counting from today)
 * @param mixed $newsSiteUrl  The root URL of the news site (the seed of the crawler)
 * @param mixed $timeToLive   Forwarded as the second constructor argument of Crawler
 *                            (exact semantics are defined by Crawler - TODO confirm)
 * @param mixed $startDate    Optional; when truthy it is forwarded as the second argument
 *                            of Crawler::crawl(), otherwise crawl() is called with the
 *                            number of days only
 * @return int The count of the value returned by Crawler::getCrawled()
 */
public function crawlForNews($nrOfDaysBack, $newsSiteUrl, $timeToLive, $startDate = null)
{
    $newsCrawler = new Crawler($newsSiteUrl, $timeToLive);

    if (!$startDate) {
        // No usable start date: let the crawler fall back to its own default.
        $newsCrawler->crawl($nrOfDaysBack);
    } else {
        $newsCrawler->crawl($nrOfDaysBack, $startDate);
    }

    $crawledPages = $newsCrawler->getCrawled();

    return count($crawledPages);
}
/**
 * Scan the image pixel by pixel, crawl an outline from every pixel that is
 * neither white nor already inside a collected outline, and slice the image
 * by each outline found.
 *
 * @return array Two-element array: the ImageCollection of slices at index 0
 *               and the CrawlerOutlineCollection of outlines at index 1
 */
public function getHotSpots()
{
    $outlineCrawler = new Crawler($this);
    $foundOutlines = new CrawlerOutlineCollection();
    $dimensions = $this->image->size();

    for ($col = 0; $col < $dimensions[0]; $col++) {
        for ($row = 0; $row < $dimensions[1]; $row++) {
            $current = $this->pixel($col, $row);

            // Skip white pixels (the 5 is presumably a colour tolerance - TODO confirm)
            // and pixels that already belong to a crawled area. The short-circuit keeps
            // the original order: contains() is only consulted for non-white pixels.
            if ($current->color()->compare(ImageColor::white(), 5) || $foundOutlines->contains($current)) {
                continue;
            }

            // Start crawling from this pixel and remember the resulting outline.
            $foundOutlines->push($outlineCrawler->crawl($col, $row));
        }
    }

    $slices = new ImageCollection();
    foreach ($foundOutlines as $foundOutline) {
        $slice = $this->image->sliceByOutline($foundOutline);
        $slices->push($slice);
    }

    return array($slices, $foundOutlines);
}
/**
 * Crawl the given URL, recording one report entry per crawled page and
 * printing progress through $this->show().
 *
 * For every EVENT_HIT_CRAWL the handler counts the <img> elements of the
 * page's DOM, times that count, and appends href / imgLength / processTime
 * to $this->report.
 *
 * @param mixed $url Passed unchanged to Crawler::crawl()
 *
 * @throws Exception
 */
public function crawl($url)
{
    $onHit = function ($href, DOMDocument $dom) {
        // Only the <img> lookup itself is timed.
        $startedAt = microtime(true);
        $imgLength = $dom->getElementsByTagName('img')->length;
        $elapsed = microtime(true) - $startedAt;

        $entry = [
            'href' => $href,
            'imgLength' => $imgLength,
            'processTime' => sprintf('%.6F', $elapsed),
        ];
        $this->report[] = $entry;

        $this->show(' - ' . $href . ' [img: ' . $imgLength . ']' . PHP_EOL);
    };

    $onBefore = function () {
        $this->show('Start crawl' . PHP_EOL);
    };

    $onAfter = function () {
        $this->show('Finish crawl' . PHP_EOL);
    };

    // Handlers are registered in the same order as before: hit, before, after.
    $pageCrawler = new Crawler();
    $pageCrawler->on(Crawler::EVENT_HIT_CRAWL, $onHit);
    $pageCrawler->on(Crawler::EVENT_BEFORE_CRAWL, $onBefore);
    $pageCrawler->on(Crawler::EVENT_AFTER_CRAWL, $onAfter);

    $pageCrawler->crawl($url);
}
// Tail of the DIR_ROOT guard whose opening lies above this chunk: the project
// root defaults to the directory containing this file.
define('DIR_ROOT', dirname(__FILE__)); }

// Locate the KvzLib checkout unless the caller already defined DIR_KVZLIB.
// The first candidate directory that contains kvzlib.php wins.
if (!defined('DIR_KVZLIB')) {
    $lookIn = array(
        '/Users/kevin/workspace/kvzlib',
        '/home/kevin/workspace/kvzlib',
        DIR_ROOT . '/ext/kvzlib',
    );

    foreach ($lookIn as $dir) {
        if (is_dir($dir) && file_exists($dir . '/kvzlib.php')) {
            define('DIR_KVZLIB', $dir);
            break;
        }
    }

    if (!defined('DIR_KVZLIB')) {
        trigger_error('KvzLib not found in either: ' . implode(', ', $lookIn), E_USER_ERROR);
    }
}

define('IMDBPHP_CONFIG', DIR_ROOT . '/config/imdb.php');

// Prepend KvzLib and the project root to the include path. PATH_SEPARATOR is
// used instead of a literal ":" so the path list is also valid on platforms
// whose separator is ";".
ini_set(
    "include_path",
    DIR_KVZLIB . PATH_SEPARATOR . DIR_ROOT . PATH_SEPARATOR . ini_get("include_path")
);

require_once DIR_KVZLIB . '/php/classes/KvzShell.php';
require_once DIR_KVZLIB . '/php/classes/KvzHTML.php';
require_once DIR_KVZLIB . '/php/all_functions.php';
require_once DIR_ROOT . '/libs/crawler.php';
require_once DIR_ROOT . '/libs/movie.php';
require_once DIR_ROOT . '/libs/store.php';
// Resolved through the include path configured above.
require_once 'imdb.class.php';

// Output location of the generated page.
$outDir = '/home/kevin/Dropbox/Public/cinema';
$outFile = 'kijken.html';

$crawlerOptions = array(
    'dir' => '/data/moviesHD',
    'minSize' => '600M',
    'cachedir' => DIR_ROOT . '/cache',
    'photodir' => $outDir . '/images',
);

// Crawl with the options above, then hand the result to the store, which
// saves it as HTML.
$Crawler = new Crawler($crawlerOptions);
$movies = $Crawler->crawl();

$Store = new Store($movies, 'html', array(
    'photovirt' => 'images',
    'outputdir' => $outDir,
    'outputfile' => $outFile,
    'separate_on_dir' => 1,
));
$Store->save();
#$Store->output();
/**
 * Crawl a single URL with a Crawler built from this object's config and
 * return whatever the crawler produces.
 *
 * @param string $url
 * @param string $rawHTML Optional; forwarded unchanged to Crawler::crawl()
 *                        (null when omitted)
 *
 * @return mixed The result of Crawler::crawl()
 */
public function extractContent($url, $rawHTML = null)
{
    $contentCrawler = new Crawler($this->config);

    return $contentCrawler->crawl($url, $rawHTML);
}
/**
 * Generate the sitemap: run the crawler and append one entry (loc, priority,
 * changefreq) to $this->urls for every found URL whose status equals
 * Crawler::STATUS_SUCCESS.
 *
 * @return $this
 */
public function generate()
{
    $this->crawler->crawl();

    $foundUrls = $this->crawler->getFoundUrls();
    foreach ($foundUrls as $location => $crawlStatus) {
        // Loose comparison is kept on purpose; the status type is not visible here.
        if ($crawlStatus != \Magelight\Sitemap\Models\Crawler::STATUS_SUCCESS) {
            continue;
        }

        $entry = [
            'loc' => $location,
            'priority' => $this->getUrlPriority($location),
            'changefreq' => $this->getUrlChangeFrequency($location),
        ];
        $this->urls[] = $entry;
    }

    return $this;
}