// Pre-fetch filters: every candidate URI is tested against these BEFORE an HTTP
// request is made, so each filter here saves requests and processor work.
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http')));                 // only plain-http URIs (https is filtered out)
$spider->addPreFetchFilter(new AllowedHostsFilter(array($seed), $allowSubDomains)); // stay on the seed host; subdomains only if $allowSubDomains
$spider->addPreFetchFilter(new UriWithHashFragmentFilter());                        // skip URIs carrying a #fragment
$spider->addPreFetchFilter(new UriWithQueryStringFilter());                         // skip URIs carrying a ?query string

// Politeness policy: the listener enforces a 450 ms pause between consecutive
// requests to the same domain, fired on the pre-request crawl event.
$politenessPolicyEventListener = new PolitenessPolicyListener(450);
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
    array($politenessPolicyEventListener, 'onCrawlPreRequest')
);

// Minimal CLI progress meter: one dot per URI accepted onto the queue.
echo "\nCrawling";
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE,
    function (Event $event) {
        echo '.';
    }
);

// Attach logging, timing and caching subscribers to the spider's HTTP client.
// NOTE(review): $logPlugin, $timerPlugin and $cachePlugin are assumed to be
// configured earlier in this file — confirm they exist before this point.
$guzzleClient = $spider->getRequestHandler()->getClient();
$guzzleClient->addSubscriber($logPlugin);
$guzzleClient->addSubscriber($timerPlugin);
$guzzleClient->addSubscriber($cachePlugin);

// Identify ourselves to the servers we crawl.
$guzzleClient->setUserAgent('PHP-Spider');

// Run the crawl.
$result = $spider->crawl();

// Report: pull the counters the stats handler collected during the crawl.
$stats = $spider->getStatsHandler();
$spiderId = $stats->getSpiderId();
$queued = $stats->getQueued();
$filtered = $stats->getFiltered(); // presumably echoed by report lines after this chunk — verify
$failed = $stats->getFailed();     // presumably echoed by report lines after this chunk — verify
echo "\n\nSPIDER ID: " . $spiderId;
echo "\n ENQUEUED: " . count($queued);