Beispiel #1
0
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $URL . " added\n\n";
        }
        flush();
    }
}
// Configure the crawler: start URL, HTML-only content, a crawling-depth
// limit of 4, and an SQLite URL cache kept under the ./tmp/ working directory.
$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
// addContentTypeReceiveRule() replaces the deprecated alias
// addReceiveContentType(); the rule itself (receive only text/html) is unchanged.
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

// Crawl repeatedly and never exit: each pass runs a multi-process crawl
// (first argument 5, mode MPMODE_CHILDS_EXECUTES_USERCODE), then the script
// sleeps for 15 minutes before starting the next pass.
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    // The report of the finished pass is fetched but not used further here.
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
Beispiel #2
0
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Set the page-limit to 50 for testing
$crawler->setPageLimit(50);
// Important for resumable scripts/processes!
$crawler->enableResumption();

// Temporary file that remembers the crawler-ID between runs of this script.
$crawler_ID_file = "/tmp/mycrawlerid_for_php.net.tmp";

// If a crawler-ID was stored by an earlier run, read it back; otherwise
// (first start of the script) there is nothing to resume.
$crawler_ID = file_exists($crawler_ID_file) ? file_get_contents($crawler_ID_file) : false;

if ($crawler_ID === false || $crawler_ID === "") {
    // First start, or the stored ID could not be read / is empty: retrieve a
    // fresh crawler-ID and store it instead of passing an invalid ID to resume().
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents($crawler_ID_file, $crawler_ID);
} else {
    // A previous run left its ID behind: continue that crawling process.
    $crawler->resume($crawler_ID);
}

// Start crawling
$crawler->goMultiProcessed(5);

// Delete the stored crawler-ID after the process is finished completely and successfully.
unlink($crawler_ID_file);

$report = $crawler->getProcessReport();

// Line break matching the output channel: newline on the CLI, <br /> otherwise.
$lb = PHP_SAPI === "cli" ? "\n" : "<br />";

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;