示例#1
0
// Prefetch filters run before a resource is requested, so every URI they reject
// saves an HTTP request and the processing work that would follow it.
// The filters are registered in the order they are listed here.
$prefetchFilters = array(
    new AllowedSchemeFilter(array('http')),
    new AllowedHostsFilter(array($seed), $allowSubDomains),
    new UriWithHashFragmentFilter(),
    new UriWithQueryStringFilter(),
);
foreach ($prefetchFilters as $prefetchFilter) {
    $spider->addPreFetchFilter($prefetchFilter);
}
// Politeness policy: register a listener on the spider's event dispatcher for the
// SPIDER_CRAWL_PRE_REQUEST event. The listener is constructed with 450, i.e. a
// 450 ms wait between consecutive requests to the same domain.
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
// The callback is the listener's onCrawlPreRequest method, passed as an array callable.
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, array($politenessPolicyEventListener, 'onCrawlPreRequest'));
// CLI progress meter: print a heading once, then one dot each time the
// SPIDER_CRAWL_PRE_ENQUEUE event is dispatched.
echo "\nCrawling";
$spider->getDispatcher()->addListener(SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE, function (Event $event) {
    // The event object is not inspected; the listener only emits the progress dot.
    echo '.';
});
// Attach the logging, timing and caching plugins to the spider's HTTP client,
// in that order, then identify the crawler through its user agent string.
$guzzleClient = $spider->getRequestHandler()->getClient();
foreach (array($logPlugin, $timerPlugin, $cachePlugin) as $clientPlugin) {
    $guzzleClient->addSubscriber($clientPlugin);
}
$guzzleClient->setUserAgent('PHP-Spider');
// Execute the crawl. The return value is kept in $result; it is not read within this excerpt.
$result = $spider->crawl();
// Report: pull the run statistics from the spider's stats handler.
$stats = $spider->getStatsHandler();
$spiderId = $stats->getSpiderId();
$queued = $stats->getQueued();
$filtered = $stats->getFiltered();
$failed = $stats->getFailed();
// Print the spider id followed by the number of enqueued entries.
// $queued is passed to count(), so it is presumably an array or Countable -- confirm against the stats handler.
// NOTE(review): $filtered and $failed are not printed here; presumably they are reported further down the script.
echo "\n\nSPIDER ID: " . $spiderId;
echo "\n  ENQUEUED:  " . count($queued);