$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);

if ($_SESSION['crawler']['respect_robots_txt'] == true) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');

// No data on page yet
if ($_SESSION['crawler']['auth'] == true) {
    $crawler->set_url_test_auth($_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
    $pattern = "#https?://" . str_replace('.', '\\.', $_SESSION['crawler']['silo']) . "#is";
    $crawler->addBasicAuthentication($pattern, $_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
}

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
$links = $crawler->processLinks($_SESSION['crawler']['domain'], $_SESSION['crawler']['respect_robots_txt']);

//$lb = "<br />";
//echo "Summary:" . $lb;
//echo "Links followed: " . $report->links_followed . $lb;
//echo "Links extracted: " . count($links) . $lb;
//echo "Documents received: " . $report->files_received . $lb;
//echo "Bytes received: " . $report->bytes_received . " bytes." . $lb;
//echo "Spider process runtime: " . round($report->process_runtime, 2) . " seconds." . $lb . $lb;

if (count($links) > 0) {