// NOTE(review): this chunk opens partway through a method. The statement that
// $sth holds at this point, and the origin of $dbh, $title, $description,
// $newprice, $url and $image, are defined before the visible code.
$sth->execute();
// Stop the whole script if the statement reported an error, printing the
// current line number and element [2] of errorInfo().
// NOTE(review): presumably $sth is a PDOStatement, whose errorCode() returns a
// SQLSTATE string. With the loose "!= 0" comparison a non-numeric code such as
// "HY000" compares equal to 0 on PHP < 8 and would go unnoticed - confirm the
// targeted PHP version.
if ($sth->errorCode() != 0) {
    die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
}
// Insert one product row. The "::text" casts and to_tsvector() are PostgreSQL
// syntax; QueryDocument receives the concatenation of the text-search vectors
// of the name and the description.
// NOTE(review): :Name and :Description each appear twice in the statement. PDO
// documents that a named marker may only be repeated when emulated prepares
// are enabled - confirm the connection settings.
$sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
$sth->bindValue(":IdMerchant", self::IdMerchant);
$sth->bindValue(":Name", $title);
$sth->bindValue(":Description", $description);
$sth->bindValue(":Price", $newprice);
$sth->bindValue(":URL", $url);
$sth->bindValue(":Image", $image);
$sth->execute();
// Same failure handling as above: abort with the line number and error text.
if ($sth->errorCode() != 0) {
    die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
}
// NOTE(review): the value bound above is $url, but $URL is echoed here. PHP
// variable names are case-sensitive, so unless $URL is assigned earlier in the
// method (not visible) this prints an undefined variable - verify.
echo $URL . " added\n\n";
} // closes a block opened before this chunk
// Push any pending output to the client/terminal.
flush();
} // presumably the end of the method
} // presumably the end of the MyCrawler class instantiated below

// Configure the crawler: start URL, content-type rule, depth limit of 4,
// working directory "./tmp/" and the SQLite URL-cache type.
$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
// The argument is a regular expression; presumably only documents whose
// content type matches it are received - confirm against the PHPCrawl docs.
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
// Crawl forever: each pass starts a multi-process crawl (first argument 5, in
// the mode where the child processes execute the user code), then waits
// 15 minutes (15 * 60 seconds) before the next pass.
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    // The report is fetched but never read in the visible code.
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
// Runs a resumable multi-process crawl of www.php.net and prints a summary of
// the process report when the crawl has finished.
// NOTE(review): $crawler is not created in this chunk; it must already hold a
// crawler instance when this code runs.
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
// Regex matching URLs that end in .jpg/.jpeg/.gif/.png; presumably such URLs
// are skipped by the crawler - confirm against the PHPCrawl docs.
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->setPageLimit(50); // Set the page-limit to 50 for testing

// Important for resumable scripts/processes!
$crawler->enableResumption();

// Temporary file that remembers the crawler-ID between runs of this script.
// Kept in one place so the check, write, read and cleanup cannot drift apart.
$crawlerIdFile = "/tmp/mycrawlerid_for_php.net.tmp";

if (!file_exists($crawlerIdFile)) {
    // At the first start of the script, retrieve the crawler-ID and store it
    // (in a temporary file in this example).
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents($crawlerIdFile, $crawler_ID);
} else {
    // The ID file is still present, so an earlier run did not reach the
    // unlink() below: read the stored crawler-ID and resume that process.
    $crawler_ID = file_get_contents($crawlerIdFile);

    // file_get_contents() returns false on failure, and the file may be empty
    // if an earlier run died while writing it. Fail with a clear message
    // instead of handing a meaningless ID to resume().
    if ($crawler_ID === false || $crawler_ID === "") {
        throw new \RuntimeException(
            "Could not read a stored crawler-ID from " . $crawlerIdFile
        );
    }

    $crawler->resume($crawler_ID);
}

// Start crawling
$crawler->goMultiProcessed(5);

// Delete the stored crawler-ID after the process is finished completely and successfully.
unlink($crawlerIdFile);

$report = $crawler->getProcessReport();

// Use a plain newline on the command line and an HTML line break otherwise.
$lb = (PHP_SAPI === "cli") ? "\n" : "<br />";

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;