/**
 * crawl method
 * Create the crawler class object and set the options for crawling
 *
 * @param string $u URL
 */
function crawl($u)
{
  $C = new MyCrawler();
  $C->setURL($u);

  // Only receive HTML pages
  $C->addContentTypeReceiveRule("#text/html#");

  // We don't want to crawl non-HTML pages, so filter out common asset
  // extensions (note the leading "\." -- without it the rule would also
  // drop pages whose URLs merely end in these letters)
  $C->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");

  // Limit the traffic to 2 MB
  $C->setTrafficLimit(2000 * 1024);

  // Follow the rules in robots.txt
  $C->obeyRobotsTxt(true);

  $C->go();
}
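// The crawl() helper above assumes a MyCrawler class is already defined.
// PHPCrawl expects you to extend its PHPCrawler base class and override
// handleDocumentInfo(), which gets called once for every received document.
// The class below is only a minimal sketch of such an override (what it
// echoes is an illustrative assumption, not taken from the original code):
include("libs/PHPCrawler.class.php"); // path depends on where PHPCrawl lives

class MyCrawler extends PHPCrawler
{
  function handleDocumentInfo($DocInfo)
  {
    $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

    // Just report the URL and HTTP status of every document received
    echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
    flush();
  }
}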
    // Now you should do something with the content of the actual
    // received page or file ($DocInfo->source); we skip it in this example
    echo $lb;
    flush();
  }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("www.php.net");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" in the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
  $lb = "\n";
} else {
  $lb = "<br />";
}
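// The listing above ends right after choosing a line-break. The object
// returned by getProcessReport() exposes the same fields used in the ITB
// example further down, so a short summary can be printed like this:
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;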
    // Now you should do something with the content of the actual
    // received page or file ($DocInfo->source); we skip it in this example
    echo $lb;
    flush();
  }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");

// Only receive content of documents with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, css-documents etc. (prefilter)
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");

// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php"; they all contain
// "http://www.php.net/manual/en/" followed by "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");

// That's it, start crawling using 5 processes
$crawler->goMultiProcessed(5);

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
  $lb = "\n";
} else {
  $lb = "<br />";
}
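// Note: according to the PHPCrawl documentation, goMultiProcessed() only
// works from the PHP CLI on unix-like systems and relies on the pcntl,
// sysvsem, posix and pdo_sqlite extensions. A pre-flight guard like this
// sketch (not part of the original example) fails fast when they are missing:
if (PHP_SAPI != "cli") {
  die("goMultiProcessed() only works from the PHP CLI." . PHP_EOL);
}
foreach (array("pcntl", "sysvsem", "posix", "pdo_sqlite") as $ext) {
  if (!extension_loaded($ext)) {
    die("goMultiProcessed() requires the '" . $ext . "' extension." . PHP_EOL);
  }
}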
    $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
    if (!file_exists($directoryName)) {
      mkdir($directoryName, 0755, true);
    }

    $crawler = new MyCrawler();
    $crawler->code = $code;
    $crawler->file_count = 0;
    $crawler->directoryName = $directoryName;

    // URL to crawl; courses before 2013 belong to the 2008 curriculum,
    // later ones to the 2013 curriculum
    $curriculum_year = $year < 2013 ? 2008 : 2013;
    $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code
         . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
    $crawler->setURL($url);

    // Only receive content of files with content-type "text/html"
    $crawler->addContentTypeReceiveRule("#text/html#");

    // Ignore links to pictures, don't even request pictures
    $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

    // Don't let it go back to the main page
    $crawler->addURLFilterRule("#displayprodikelas.php# i");

    // That's enough, now here we go
    echo "Start crawling for year " . $year . " semester " . $semester;
    $crawler->go();

    // At the end, after the process is finished, we print a short
    // report (see method getProcessReport() for more information)
    $report = $crawler->getProcessReport();
    echo "Summary for " . $code . ":" . $lb;
    echo "Links followed: " . $report->links_followed . $lb;
    echo "Documents received: " . $report->files_received . $lb;
    echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
    echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
  }
}
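// The code, file_count and directoryName properties set on the crawler
// above imply a handleDocumentInfo() override that writes every received
// page into $directoryName. The original class is not shown here, so the
// following is only an assumed sketch of what it could look like:
class MyCrawler extends PHPCrawler
{
  public $code;
  public $file_count;
  public $directoryName;

  function handleDocumentInfo($DocInfo)
  {
    // $DocInfo->received is true if the document's content was received
    if ($DocInfo->received) {
      $path = $this->directoryName . '/' . $this->code . '-' . $this->file_count . '.html';
      file_put_contents($path, $DocInfo->source);
      $this->file_count++;
    }
  }
}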
    flush();
  }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");

// Respect rel="nofollow" tags, obey robots.txt and don't search for
// links outside of ordinary href-attributes
$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, stylesheets and scripts
// (with or without a query-string); don't even request them
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" in the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
  $lb = "\n";
} else {
  $lb = "<br />";
}
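// Since this crawl targets a single internal host, it can also be pinned
// down explicitly. PHPCrawl's setFollowMode() controls how far links are
// followed; shown here as an optional extra, not part of the original:
// 0 = follow every link, 1 = stay in the same domain, 2 = stay on the
// same host (the default, made explicit here), 3 = stay in/below the
// path of the entry-URL.
$crawler->setFollowMode(2);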