コード例 #1
0
            if (!$this->url_exists($link)) {
                unset($this->links[$ldx]);
                continue;
            }
            $this->links[$ldx] = $link;
        }
        // Final re-order
        $this->links = array_values($this->links);
        return $this->links;
    }
}
// Create an instance of the crawler class for the domain stored in the
// session and configure its behaviour before the crawl is started
// (see the PHPCrawl class reference for more options and details).
$crawler = new MyCrawler($_SESSION['crawler']['domain']);

// Follow mode 2: per the PHPCrawl documentation this keeps the crawler on
// the same host as the start URL -- confirm against the bundled version.
$crawler->setFollowMode(2);

// Only receive the content of documents whose content type matches text/html.
$crawler->addContentTypeReceiveRule("#text/html#");

// Do not request URLs that end in a common image extension.
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

$crawler->enableCookieHandling(true);

// !empty() has the same truthiness as the former "== true" comparison but
// does not raise an undefined-index notice when the setting is absent.
if (!empty($_SESSION['crawler']['respect_robots_txt'])) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

// Restrict link discovery: no aggressive search, skip the special document
// sections, only look for links in text/html documents and only in href
// attributes.
$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));

// Identify the crawler to remote servers. The previous value ended in a
// stray, unbalanced ")" which has been removed.
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');
// no data on page yet
if ($_SESSION['crawler']['auth'] == true) {
    $crawler->set_url_test_auth($_SESSION['crawler']['user'], $_SESSION['crawler']['pass']);
コード例 #2
0
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
            echo "Document Content: " . $DocInfo->source . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        echo $lb;
        flush();
    }
}
// Read the search term from the request. The array key must be a quoted
// string: a bare word is an undefined constant and throws an Error on PHP 8.
// A missing or non-string parameter (e.g. ?query[]=x) falls back to an empty
// search term instead of raising a notice or a TypeError.
$searchTerm = (isset($_GET['query']) && is_string($_GET['query'])) ? $_GET['query'] : '';

// URL-encode the user-supplied term so spaces, "&", "#" and similar
// characters cannot break or alter the query string of the start URL.
$url = "www.flipkart.com/search/a/all?fk-search=all&query=" . urlencode($searchTerm);

// Configure the crawler: start URL, receive only text/html content, skip
// image URLs, keep cookies, and cap the crawl at 1000 KiB of traffic and
// 5 pages.
$crawler = new MyCrawler();
$crawler->setURL($url);
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);
$crawler->setTrafficLimit(1000 * 1024);
$crawler->setPageLimit(5);
// Follow mode 3: per the PHPCrawl documentation this restricts the crawl to
// links in or below the path of the start URL -- confirm against the bundled
// library version.
$crawler->setFollowMode(3);

// Run the crawl, then fetch the summary report.
$crawler->go();
$report = $crawler->getProcessReport();

// Use a plain newline on the command line and an HTML line break otherwise.
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

// Print the crawl summary.
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;