    }

    // Now you should do something with the content of the actual
    // received page or file ($DocInfo->source), we skip it in this example

    echo $lb;
    flush();
  }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");

$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
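    // The first listing breaks off at this point. A minimal sketch of how
    // it typically continues, following the stock PHPCrawl example and the
    // documented PHPCrawlerProcessReport properties (a hedged
    // reconstruction, not necessarily this author's exact code):
    $lb = "\n";       // line-break for CLI output
} else {
    $lb = "<br />";   // line-break for browser output
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;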
// (Second example: a MyCrawler variant whose handler collects the links
// it finds; only the tail of the class is shown.)
    return $this->links;
  }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.

$crawler = new MyCrawler($_SESSION['crawler']['domain']);
$crawler->setFollowMode(2);
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);

if ($_SESSION['crawler']['respect_robots_txt'] == true) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0'); // no data on page yet

if ($_SESSION['crawler']['auth'] == true) {
    // set_url_test_auth() is a method of this MyCrawler class,
    // not part of the stock PHPCrawl API
    $crawler->set_url_test_auth($_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);

    // Use "#" as the regex delimiter so the slashes in "https?://"
    // don't need escaping
    $pattern = "#https?://" . str_replace('.', '\\.', $_SESSION['crawler']['silo']) . "#is";
    $crawler->addBasicAuthentication($pattern, $_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
}

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
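// A minimal sketch of a follow-up step, assuming the MyCrawler variant
// above exposes its collected links through a getLinks() accessor that
// returns $this->links (the accessor name is hypothetical; the listing
// only shows its return statement):
$links = $crawler->getLinks();  // hypothetical accessor for $this->links
echo "Links collected: " . count($links) . "<br />";
echo "Documents received: " . $report->files_received . "<br />";
echo "Bytes received: " . $report->bytes_received . " bytes<br />";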