Exemplo n.º 1
0
// --- Crawler configuration -------------------------------------------------

// URL filter rule: matches URLs that end in a common image extension
// (jpg, jpeg, gif, png), case-insensitively.
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Enable the crawler's cookie handling for the duration of the crawl.
$crawler->enableCookieHandling(true);

// robots.txt and nofollow handling are opt-in via the session settings.
// !empty() has the same truthiness as the former `== true` comparison, but
// does not raise an "undefined index" notice when the setting is missing.
if (!empty($_SESSION['crawler']['respect_robots_txt'])) {
    // The robots.txt location is built from the configured domain.
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

// Link extraction: aggressive search off, all special document sections
// excluded, links searched only in text/html responses and only taken from
// 'href'.
$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));

// Identify the crawler as "<product>/<version>". The previous value carried a
// stray, unbalanced closing parenthesis.
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');
// No data on the page yet.
// Optional HTTP authentication, enabled through the session settings.
// !empty() keeps the truthiness of the former `== true` check without raising
// a notice when the 'auth' key is missing.
if (!empty($_SESSION['crawler']['auth'])) {
    $crawler->set_url_test_auth($_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);

    // Regex matching any http/https URL on the configured silo host.
    // '#' is used as the delimiter because the pattern itself contains "//":
    // with '/' as delimiter the expression ended right after "https?:" and the
    // remainder was parsed as (invalid) modifiers, so the rule never matched.
    // preg_quote() escapes every regex metacharacter in the host name, not
    // only the dots.
    $pattern = '#https?://' . preg_quote($_SESSION['crawler']['silo'], '#') . '#is';
    $crawler->addBasicAuthentication($pattern, $_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
}
// Configuration is complete: start the crawl.
$crawler->go();

// After the crawl process has finished, fetch the process report
// (see getProcessReport() for details) and the processed links for the
// configured domain.
$report = $crawler->getProcessReport();
$links = $crawler->processLinks(
    $_SESSION['crawler']['domain'],
    $_SESSION['crawler']['respect_robots_txt']
);

// Optional HTML summary of the crawl, currently disabled:
//$lb     = "<br />";
//echo "Summary:" . $lb;
//echo "Links followed: " . $report->links_followed . $lb;
//echo "Links extracted: " . count($links) . $lb;
//echo "Documents received: " . $report->files_received . $lb;
//echo "Bytes received: " . $report->bytes_received . " bytes." . $lb;
//echo "Spider Process runtime: " . round($report->process_runtime, 2) . " seconds." . $lb . $lb;
if (count($links) > 0) {