// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://www.ttm-iitd.in/");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
$crawler->setPageLimit(1);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
    $lb = "\n";
}
else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
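// The example above assumes a user-defined class "MyCrawler" that extends
// the PHPCrawler base-class and overrides its handleDocumentInfo()-method,
// which gets called for every received document. A minimal sketch of such a
// class follows; the include-path is an assumption and depends on where the
// library is installed, while url and http_status_code are documented
// properties of the passed PHPCrawlerDocumentInfo-object:

require_once("libs/PHPCrawler.class.php"); // assumed library location

class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Detect linebreak for output ("\n" in CLI-mode, otherwise "<br />")
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        }
        else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP-status-code of the document
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}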
// Inherit the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br />").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        }
        else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP-status-code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->setPageLimit(50); // Set the page-limit to 50 for testing

// Important for resumable scripts/processes!
$crawler->enableResumption();

// At the first start of the script retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
}
// Otherwise read the stored crawler-ID and resume the aborted process
else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
}

// Start crawling with 5 processes
$crawler->goMultiProcessed(5);

// Delete the stored crawler-ID after the process is finished
// completely and successfully.
unlink("/tmp/mycrawlerid_for_php.net.tmp");
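// A possible shell-session exercising the resumable script above (the
// file-name "resume_example.php" is just an assumption for illustration):
//
//   $ php resume_example.php    # first run: the crawler-ID gets stored
//   ^C                          # abort the process at any point
//   $ php resume_example.php    # next run: the stored ID is found and
//                               # the crawl resumes where it was aborted
//
// Note that goMultiProcessed() only works when the script is run from
// the command-line (CLI), so it has to be started from a shell anyway.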
// Inherit the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br />").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        }
        else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP-status-code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore images, stylesheets, scripts, icons and XML-files,
// also when they are followed by a query-string
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js|ico|xml)([?].*)?\$# i");

$crawler->setPageLimit(3); // Set the page-limit to 3 for testing

// Important for resumable scripts/processes!
$crawler->enableResumption();

// At the first start of the script retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_localhost.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_localhost.tmp", $crawler_ID);
}
// Otherwise read the stored crawler-ID and resume the aborted process
else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_localhost.tmp");
    $crawler->resume($crawler_ID);
}

// Start crawling with 5 processes
$crawler->goMultiProcessed(5);

// Delete the stored crawler-ID after the process is finished
// completely and successfully.
unlink("/tmp/mycrawlerid_for_localhost.tmp");
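// For a local crawl like this one the handleDocumentInfo()-override would
// typically do more than print the URL. A minimal sketch that writes every
// received document to disk; "received" and "source" are documented
// properties of PHPCrawlerDocumentInfo, while the class-name and the
// target-directory "/tmp/crawl_dump" are assumptions for illustration:

class MyStoringCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Skip documents that were not actually received
        // (filtered out, failed or not requested)
        if ($DocInfo->received == false) return;

        // Make sure the dump-directory exists
        if (!is_dir("/tmp/crawl_dump")) {
            mkdir("/tmp/crawl_dump");
        }

        // Derive a flat file-name from the URL and store the page-source
        file_put_contents("/tmp/crawl_dump/" . md5($DocInfo->url) . ".html", $DocInfo->source);
    }
}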