// Include the PHPCrawl main class (adjust the path to your PHPCrawl installation)
include("libs/PHPCrawler.class.php");

$crawler = new MyCrawler();
$crawler->setURL("www.php.net");

// Only receive the content of documents with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to images
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Set the page-limit to 50 for testing
$crawler->setPageLimit(50);

// Important for resumable scripts/processes!
$crawler->enableResumption();

// At the first start of the script, retrieve the crawler-ID and store it
// (in a temporary file in this example).
// On a restart, read the stored crawler-ID back and resume the crawler.
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
} else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
}

// Start crawling with 5 parallel processes
$crawler->goMultiProcessed(5);

// Delete the stored crawler-ID after the process has finished completely and successfully
unlink("/tmp/mycrawlerid_for_php.net.tmp");

// Print a short summary of the crawling process
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
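The example assumes a MyCrawler class that extends PHPCrawler and overrides its handleDocumentInfo() method, which PHPCrawl calls once for every document it receives. A minimal sketch follows (the echoed fields are just an illustration; any per-document processing can go in the method body), assuming PHPCrawl 0.8x, where the method receives a PHPCrawlerDocumentInfo object:

// Extend the PHPCrawler class and override handleDocumentInfo().
// This method gets called once for every document the crawler finds.
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo)
    {
        // Example only: print the URL and HTTP status code of each received document
        echo "Page requested: " . $PageInfo->url .
             " (" . $PageInfo->http_status_code . ")" . PHP_EOL;
    }
}

The class definition has to appear before the new MyCrawler() call above. Also note that goMultiProcessed() is only available when PHP runs as a CLI script on a Unix-like system with the PCNTL, SysV-semaphore and POSIX extensions available; in other environments, use go() for single-process crawling instead.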