<?php

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Spider;

require_once __DIR__ . '/../vendor/autoload.php';

// Minimal crawl example: fetch a start page, follow a limited number of
// links found in one <div>, then print crawl statistics and page titles.

// Create Spider
$spider = new Spider('http://www.dmoz.org');

// Add a URI discoverer. Without it, the spider does nothing.
// In this case, we want <a> tags from a certain <div>.
$spider->addDiscoverer(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example.
// In this case, we only get the first 10 items from the start page.
$spider->setMaxDepth(1);
$spider->setMaxQueueSize(10);

// Execute crawl
$spider->crawl();

// Report
$stats = $spider->getStatsHandler();
echo "\nSPIDER ID: " . $stats->getSpiderId();
echo "\n ENQUEUED: " . count($stats->getQueued());
echo "\n SKIPPED: " . count($stats->getFiltered());
echo "\n FAILED: " . count($stats->getFailed());

// Finally we could do some processing on the downloaded resources.
// In this example, we will echo the title of all resources.
echo "\n\nDOWNLOADED RESOURCES: ";
foreach ($spider->getPersistenceHandler() as $resource) {
    $titleNodes = $resource->getCrawler()->filterXpath('//title');

    // Assuming getCrawler() returns a Symfony DomCrawler instance: text()
    // throws InvalidArgumentException on an empty node list, so a single
    // resource without a <title> would otherwise abort the whole report.
    $title = count($titleNodes) > 0 ? $titleNodes->text() : '(no title)';

    echo "\n - " . $title;
}
// Execute the crawl
$result = $spider->crawl();

// Report
$stats = $spider->getStatsHandler();
$spiderId = $stats->getSpiderId();
$queued = $stats->getQueued();
$filtered = $stats->getFiltered();
$failed = $stats->getFailed();

echo "\n\nSPIDER ID: " . $spiderId;
echo "\n ENQUEUED: " . count($queued);
echo "\n SKIPPED: " . count($filtered);
echo "\n FAILED: " . count($failed);

// With the information from some of the plugins and listeners, we can determine some metrics.
// NOTE(review): $start, $timerPlugin and $politenessPolicyEventListener are not
// defined in this part of the script; they are expected to be set up earlier.
$peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2); // bytes -> MB
$totalTime = round(microtime(true) - $start, 2);
// Presumably totalDelay is in microseconds (divided by 10^6 to get seconds) - TODO confirm.
$totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);

echo "\n\nMETRICS:";
echo "\n PEAK MEM USAGE: " . $peakMem . 'MB';
echo "\n TOTAL TIME: " . $totalTime . 's';
echo "\n REQUEST TIME: " . $timerPlugin->getTotal() . 's';
echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's';
// Whatever is left after subtracting request time and politeness delay.
echo "\n PROCESSING TIME: " . ($totalTime - $timerPlugin->getTotal() - $totalDelay) . 's';

// Finally we could start some processing on the downloaded resources
echo "\n\nDOWNLOADED RESOURCES: ";
$downloaded = $spider->getPersistenceHandler();
foreach ($downloaded as $resource) {
    $titleNodes = $resource->getCrawler()->filterXpath('//title');

    // Assuming getCrawler() returns a Symfony DomCrawler instance: text()
    // throws InvalidArgumentException on an empty node list, so a single
    // resource without a <title> would otherwise abort the whole listing.
    $title = count($titleNodes) > 0 ? $titleNodes->text() : '(no title)';

    // The Content-Length header may be missing (e.g. chunked responses) or,
    // depending on the HTTP client version, come back as a stringable header
    // object; normalise to an int so the size arithmetic below is always valid.
    $contentLength = (int) (string) $resource->getResponse()->getHeader('Content-Length', true);

    // do something with the data
    echo "\n - " . str_pad("[" . round($contentLength / 1024), 4, ' ', STR_PAD_LEFT) . "KB] {$title}";
}