/**
 * crawl method
 * Creates the crawler object and sets its options for crawling.
 * @param string $u URL to crawl
 */
function crawl($u)
{
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#"); /* Only receive HTML pages */
    $C->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i"); /* We don't want to crawl non-HTML resources */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true); /* Follow the rules in robots.txt */
    $C->go();
}
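// The MyCrawler class used above is defined elsewhere; it extends PHPCrawl's
// PHPCrawler base class and overrides handleDocumentInfo(). A minimal sketch of
// such a subclass, following PHPCrawl's documented extension pattern (the echo
// body is illustrative only, not this project's actual handler):
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Called once for every document the crawler receives
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}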
$sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://vietnamnet.vn/vn/chinh-tri/285748/nhan-su-duoc-rut-hay-khong-do-dai-hoi-quyet-dinh.html"); $crawler->setCrawlingDepthLimit(0); $crawler->enableCookieHandling(true); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb; echo "Bytes received: " . $report->bytes_received . " bytes" . $lb; echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
$sql = "UPDATE news SET num_like = '{$nLike}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://blogtamsu.vn/"); $crawler->setCrawlingDepthLimit(1); $crawler->enableCookieHandling(true); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb; echo "Bytes received: " . $report->bytes_received . " bytes" . $lb; echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("localhost.p2.gta.charlie"); $crawler->obeyNoFollowTags(TRUE); $crawler->obeyRobotsTxt(TRUE); $crawler->enableAggressiveLinkSearch(FALSE); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short
// break;
// }

foreach ($codes as $code) {
    // Create a new directory specific to this faculty
    $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
    if (!file_exists($directoryName)) {
        mkdir($directoryName, 0755, true);
    }

    $crawler = new MyCrawler();
    $crawler->code = $code;
    $crawler->file_count = 0;
    $crawler->directoryName = $directoryName;

    // URL to crawl
    $curriculum_year = $year < 2013 ? 2008 : 2013;
    $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code
        . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
    $crawler->setURL($url);

    // Only receive content of files with content-type "text/html"
    $crawler->addContentTypeReceiveRule("#text/html#");

    // Ignore links to pictures, don't even request pictures
    $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

    // Don't let it go back to the main page
    $res = $crawler->addURLFilterRule("#displayprodikelas.php# i");

    // That's enough, now here we go
    echo "Start crawling for year " . $year . " semester " . $semester;
    $crawler->go();

    // At the end, after the process is finished, we print a short
    // report (see method getProcessReport() for more information)
    $report = $crawler->getProcessReport();
    echo "Summary for " . $code . ":" . $lb;
    echo "Links followed: " . $report->links_followed . $lb;
    echo "Documents received: " . $report->files_received . $lb;
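// The custom properties assigned above ($code, $file_count, $directoryName)
// suggest the handler writes each received page into the per-faculty
// directory. A sketch of what that override might look like (assumed from the
// property names, not taken from the original source):
class MyCrawler extends PHPCrawler
{
    public $code;
    public $file_count;
    public $directoryName;

    function handleDocumentInfo($DocInfo)
    {
        // Save the raw HTML of each received page, numbered sequentially
        $path = $this->directoryName . '/' . $this->code . '-' . $this->file_count . '.html';
        file_put_contents($path, $DocInfo->source);
        $this->file_count++;
    }
}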
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("www.php.net"); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); if (PHP_SAPI == "cli") {
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            // if we got here without dying, then we're good to go
            echo $url . " added\n\n";
        }
        // just flush the buffer so we can keep up with the progress
        flush();
    }
}

//---------------------------------------------------------------------------------------------
// Now, on to the implementation. First, create the object.
$crawler = new MyCrawler();

// Now, set the URL. Try to be specific about the target, because sometimes
// the main page does some weird redirections.
$crawler->setURL("http://markavip.com/ae/");

// Let's receive only HTML and limit the depth to 4 (enough for our example).
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);

// We need a temp dir. Could be /tmp, but I like to keep things together.
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

// This is used to resume old crawls. Since we're doing this in a loop, I don't recommend it.
/*
$crawler->enableResumption();
if (!file_exists("./tmp/markavip_id.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("./tmp/markavip_id.tmp", $crawler_ID);
}
else
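// For reference, PHPCrawl's documented resumption pattern looks roughly like
// the sketch below (the ID-file path is illustrative). On the first run the
// crawler's ID is stored; on a re-run it is read back and passed to resume()
// so an aborted crawl picks up where it left off:
$crawler->enableResumption();
if (!file_exists("./tmp/crawler_id.tmp")) {
    // First run: remember this crawl's ID for a later resume
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("./tmp/crawler_id.tmp", $crawler_ID);
} else {
    // Re-run: resume the crawl identified by the stored ID
    $crawler_ID = file_get_contents("./tmp/crawler_id.tmp");
    $crawler->resume($crawler_ID);
}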
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://vnexpress.net");
$crawler->setCrawlingDepthLimit(1);
$crawler->go();

$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }

            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument)
                VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("https://en-sa.namshi.com/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
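// The QueryDocument column filled above is a Postgres tsvector, so stored rows
// can later be searched with a ts-query. A sketch of the matching read side
// (the search phrase and the plainto_tsquery() choice are illustrative, not
// part of the original code; unquoted Postgres identifiers come back lowercased):
$sth = $dbh->prepare("SELECT Name, Price, URL FROM Products WHERE QueryDocument @@ plainto_tsquery(:q)");
$sth->bindValue(":q", "red dress");
$sth->execute();
foreach ($sth->fetchAll(PDO::FETCH_ASSOC) as $row) {
    echo $row["name"] . " - " . $row["price"] . " - " . $row["url"] . "\n";
}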
echo "Document Content: " . $DocInfo->content . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://www.ttm-iitd.in/"); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); $crawler->setPageLimit(1); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport();
echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } //echo "ssadasdsa"; //flush (); } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://ngoisao.net/tin-tuc/thu-gian/an-choi/dia-chi-cuoi-tuan-hai-khong-gian-de-tron-o-sapa-3346068.html"); $crawler->setCrawlingDepthLimit(0); $crawler->enableCookieHandling(true); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb; echo "Bytes received: " . $report->bytes_received . " bytes" . $lb; echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //    return -1;
        //}

        //echo "$header->header_raw.<br>";
    }
}

// Extend the class and override the handleDocumentInfo()-method

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("https://www.youtube.com");
$crawler->setCrawlingDepthLimit(0);

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
// $crawler->setTrafficLimit(1000000 * 1024);

// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
//$crawler->handleDocumentInfo($PageInfo);
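// The commented-out block above hints at PHPCrawl's header hook: when
// handleHeaderInfo() returns a negative integer, the crawler aborts the
// download of that document before the body is transferred. An active version
// might look like this (a sketch based on the documented contract, not code
// from this project):
class HtmlOnlyCrawler extends PHPCrawler
{
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
        // Skip downloading anything that isn't an HTML page
        if (strpos($header->content_type, "text/html") === false) {
            return -1;
        }
        return 1;
    }
}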
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://rentpad.com.ph/long-term-rentals/cebu/apartment"); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); if (PHP_SAPI == "cli") {
        // if (!(stripos($DocInfo->url, 'http://www.thework.com') === FALSE)) {
        //echo $DocInfo->url . "," . $DocInfo->http_status_code . "," . $DocInfo->referer_url . $lb;
        echo $DocInfo->http_status_code . "," . $DocInfo->referer_url . "," . $DocInfo->url . $lb;
        // }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source); we skip it in this example
        //echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("dev.thework.com");

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);
//$crawler->setFollowMode(1);
//$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
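// Since the handler above prints "status,referer,url" for every request, this
// snippet is effectively a link checker. A variant of the handleDocumentInfo()
// override that reports only broken links might look like this (a sketch; the
// >= 400 threshold is an assumption, not from the original code):
function handleDocumentInfo($DocInfo)
{
    $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";
    // Report only client/server errors (404s, 500s, ...) with their referer
    if ($DocInfo->http_status_code >= 400) {
        echo "BROKEN: " . $DocInfo->url . " (status " . $DocInfo->http_status_code
            . ", linked from " . $DocInfo->referer_url . ")" . $lb;
    }
    flush();
}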
$sql = "UPDATE news SET num_like = '{$nLike}' , num_share = '{$nShare}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://www.24h.com.vn/am-thuc/cach-muoi-dua-cai-ngon-gion-vang-uom-c460a765028.html"); $crawler->setCrawlingDepthLimit(0); $crawler->enableCookieHandling(true); $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb;
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }

            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,OldPrice,Price,URL,Image,QueryDocument)
                VALUES (:IdMerchant,:Name::text,:Description::text,:OldPrice,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":OldPrice", $oldprice);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("http://wysada.com/en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //    return -1;
        //}

        //echo "$header->header_raw.<br>";
    }
}

// Extend the class and override the handleDocumentInfo()-method

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://www.doisongphapluat.com/tin-tuc/tin-trong-nuoc/dai-hoi-cua-doan-ket-dan-chu-ky-cuong-doi-moi-a129965.html");
$crawler->setCrawlingDepthLimit(0);

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
// $crawler->setTrafficLimit(1000000 * 1024);

// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
//$crawler->handleDocumentInfo($PageInfo);
                    } // end if
                } else {
                    echo 'skip these pages <br>';
                }
            }
            $html->clear();
            unset($html);
        }
        flush();
    }
}

// Bring the crawler out
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://harcourts.co.nz/");

// Crawl only URLs with the word "property" in them
$crawler->addURLFollowRule("#property# i");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 10 MB
$crawler->setTrafficLimit(10 * 1024 * 1024);

// Start crawler
$crawler->go();

// At the end, after the process is finished, print the report
$report = $crawler->getProcessReport();

if (PHP_SAPI == "cli") {
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("www.sysfoghost.in"); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); if (PHP_SAPI == "cli") {
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl (the entry-page of the mysql-documentation on php.net) $crawler->setURL("http://www.php.net/manual/en/book.mysql.php"); // Only receive content of documents with content-type "text/html" $crawler->addReceiveContentType("#text/html#"); // Ignore links to pictures, css-documents etc (prefilter) $crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i"); // Every URL within the mysql-documentation looks like // "http://www.php.net/manual/en/function.mysql-affected-rows.php" // or "http://www.php.net/manual/en/mysql.setup.php", they all contain // "http://www.php.net/manual/en/" followed by "mysql" somewhere. // So we add a corresponding follow-rule to the crawler. $crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i"); // That's it, start crawling using 5 processes $crawler->goMultiProcessed(5); // At the end, after the process is finished, we print a short // report (see method getReport() for more information) $report = $crawler->getProcessReport();
$sql = "UPDATE news SET num_like = '{$nLike}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://dantri.com.vn/the-gioi/my-phan-no-vi-video-thuy-thu-quy-goi-truoc-binh-sy-iran-20160119082404452.htm"); $crawler->setCrawlingDepthLimit(0); $crawler->enableCookieHandling(true); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb; echo "Bytes received: " . $report->bytes_received . " bytes" . $lb; echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }

            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument)
                VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error at line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb; } else { echo "Content not received" . $lb; } // Now you should do something with the content of the actual // received page or file ($DocInfo->source), we skip it in this example echo $lb; flush(); } } // Now, create a instance of your class, define the behaviour // of the crawler (see class-reference for more options and details) // and start the crawling-process. $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://www.divxturka.net/"); // Only receive content of files with content-type "text/html" $crawler->addContentTypeReceiveRule("#text/html#"); // Ignore links to pictures, dont even request pictures $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i"); // Store and send cookie-data like a browser does $crawler->enableCookieHandling(true); // Set the traffic-limit to 1 MB (in bytes, // for testing we dont want to "suck" the whole site) $crawler->setTrafficLimit(1000 * 1024); // Thats enough, now here we go $crawler->go(); // At the end, after the process is finished, we print a short // report (see method getProcessReport() for more information) $report = $crawler->getProcessReport(); if (PHP_SAPI == "cli") {