// NOTE(review): the statements down to the two closing braces are the tail of
// a method (and its class, presumably MyCrawler) whose beginning lies outside
// this excerpt; they are reproduced unchanged, only reformatted.
if (!$this->url_exists($link)) {
    // Drop links that url_exists() rejects and move on to the next one.
    unset($this->links[$ldx]);
    continue;
}
$this->links[$ldx] = $link;
}
// Final re-order: unset() leaves gaps in the keys, so re-index from 0.
$this->links = array_values($this->links);
return $this->links;
}
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler($_SESSION['crawler']['domain']);

// Follow mode 2 — per the PHPCrawl class reference this restricts the crawl
// to links on the same host as the start URL (TODO confirm against the
// PHPCrawl version in use).
$crawler->setFollowMode(2);

// Only receive HTML documents and never request common image files.
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);

// !empty() keeps the original truthiness semantics of "== true" but does not
// raise an undefined-index warning when the session flag was never set.
if (!empty($_SESSION['crawler']['respect_robots_txt'])) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

// Link extraction: non-aggressive search, skip the special document sections,
// look only in HTML content and only at "href" attributes.
$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));

// The original literal ended in a stray, unbalanced ")" — removed.
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');

// no data on page yet
if (!empty($_SESSION['crawler']['auth'])) {
    // Pass the stored credentials to the crawler's URL-test authentication.
    $crawler->set_url_test_auth($_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
// NOTE(review): the statements down to the two closing braces are the tail of
// a document-handling method (and its class) whose beginning lies outside this
// excerpt; they are reproduced unchanged, only reformatted.
if ($DocInfo->received == true) {
    echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
    echo "Document Content: " . $DocInfo->source . $lb;
} else {
    echo "Content not received" . $lb;
}
echo $lb;
// Push the output buffer so progress is emitted while the crawl is running.
flush();
}
}

// Build the search URL from the "query" request parameter.
// - The key must be quoted: a bare `query` is an undefined constant, which is
//   a fatal Error on PHP 8.
// - The value is already URL-decoded by PHP, so it has to be re-encoded before
//   being placed in a query string; otherwise spaces, "&" or "#" corrupt it.
// - A missing or non-string parameter (e.g. query[]=x) falls back to "".
$query = (isset($_GET['query']) && is_string($_GET['query'])) ? $_GET['query'] : '';
$url = "www.flipkart.com/search/a/all?fk-search=all&query=" . urlencode($query);

$crawler = new MyCrawler();
$crawler->setURL($url);

// Only receive HTML documents and never request common image files.
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);

// Keep the crawl small: traffic limit of 1000 * 1024 and a page limit of 5.
$crawler->setTrafficLimit(1000 * 1024);
$crawler->setPageLimit(5);

// Follow mode 3 — per the PHPCrawl class reference this only follows links in
// the same path as the start URL or below (TODO confirm against the PHPCrawl
// version in use).
$crawler->setFollowMode(3);

$crawler->go();

// Print a summary of the finished crawl, using a line break that suits the
// current SAPI (plain newline on the command line, <br /> otherwise).
$report = $crawler->getProcessReport();

if (PHP_SAPI === "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;