/**
 * crawl method
 * Create the crawler class object and set the options for crawling
 * @param string $u URL
 */
function crawl($u) {
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#"); /* Only receive HTML pages */
    $C->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|svg|css|js)$# i"); /* We don't want to crawl non-HTML pages */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true); /* Should we follow robots.txt */
    $C->go();
}
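// The helper above assumes a MyCrawler subclass is defined elsewhere in the
// project. A minimal sketch of what such a class could look like (the body of
// the overridden handleDocumentInfo() callback here is an assumption and just
// prints each visited URL):
class MyCrawler extends PHPCrawler {
    function handleDocumentInfo($DocInfo) {
        // Called once for every requested document
        echo $DocInfo->url . " (" . $DocInfo->http_status_code . ")\n";
        flush();
    }
}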
*/
        // if ($DocInfo->http_status_code != "200") {
        // if (!(stripos($DocInfo->url, 'http://www.thework.com') === FALSE)) {
        //echo $DocInfo->url . "," . $DocInfo->http_status_code . "," . $DocInfo->referer_url . $lb;
        echo $DocInfo->http_status_code . "," . $DocInfo->referer_url . "," . $DocInfo->url . $lb;
        // }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        //echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("dev.thework.com");

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);

//$crawler->setFollowMode(1);
//$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

if (PHP_SAPI == "cli") {
    $lb = "\n";
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
    }
}

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://vietnamnet.vn/vn/chinh-tri/285748/nhan-su-duoc-rut-hay-khong-do-dai-hoi-quyet-dinh.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();

$report = $crawler->getProcessReport();
//echo $report->;

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
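// Note: interpolating $nLike, $nCommand and $id directly into the SQL string
// above is fragile. A sketch of the same UPDATE as a mysqli prepared
// statement (assumes $conn is the mysqli handle used above; the "sss" type
// string is an assumption since the columns aren't shown in this excerpt):
$stmt = $conn->prepare("UPDATE news SET num_like = ?, num_comment = ? WHERE id = ?");
$stmt->bind_param("sss", $nLike, $nCommand, $id);
if ($stmt->execute()) {
    echo "Record updated successfully: id = " . $id . "<br>";
} else {
    echo "Error updating record: id = " . $id . " " . $stmt->error . "<br>";
}
$stmt->close();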
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
    }
}

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://blogtamsu.vn/");
$crawler->setCrawlingDepthLimit(1);
$crawler->enableCookieHandling(true);
$crawler->go();

$report = $crawler->getProcessReport();
//echo $report->;

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
class MyCrawler extends PHPCrawler {
    function handleDocumentInfo($DocInfo) {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        } else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP-status-code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png|css|js|ico|xml)([?].*)?$# i");
$crawler->setPageLimit(3); // Set the page-limit to 3 for testing

// Important for resumable scripts/processes!
$crawler->enableResumption();

// At the first start of the script, retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
} else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
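// Worth noting: as far as the PHPCrawl class reference documents it,
// enableResumption() requires the PHP PDO extension with the SQLite driver,
// because the crawler persists its URL-cache and state to disk between runs.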
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("www.php.net");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
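// Where the comment above says "do something with $DocInfo->source", one
// simple possibility is pulling the page title out of the received HTML.
// A sketch of what could go inside handleDocumentInfo() at that point
// (the regular expression is a rough assumption, not part of the original):
if ($DocInfo->received == true && preg_match("#<title>(.*?)</title>#si", $DocInfo->source, $m)) {
    echo "Page title: " . trim($m[1]) . $lb;
}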
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");

// Only receive content of documents with content-type "text/html"
$crawler->addReceiveContentType("#text/html#");

// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|css|js)$# i");

// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php", they all contain
// "http://www.php.net/manual/en/" followed by "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");

// That's it, start crawling using 5 processes
$crawler->goMultiProcessed(5);

// At the end, after the process is finished, we print a short
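// goMultiProcessed() forks child processes and, per the PHPCrawl docs, only
// works when the script runs from the command line on a unix-like system.
// A defensive guard before the call can make that explicit; a sketch:
if (PHP_SAPI != "cli") {
    die("Multi-process crawling has to be run from the command line.\n");
}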
                        // Insert the data in the database
                        $conn->query($sql);
                    } // end if
                } else {
                    echo 'skip these pages <br>';
                }
            }
            $html->clear();
            unset($html);
        }
        flush();
    }
}

// Bring the crawler out
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://harcourts.co.nz/");

// Crawl only URLs with the word "property" in them
$crawler->addURLFollowRule("#property# i");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 10 MB
$crawler->setTrafficLimit(10 * 1024 * 1024);

// Start crawler
$crawler->go();

// At the end, after the process is finished print report
            }
            if (!$this->url_exists($link)) {
                unset($this->links[$ldx]);
                continue;
            }
            $this->links[$ldx] = $link;
        }

        // Final re-order
        $this->links = array_values($this->links);
        return $this->links;
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler($_SESSION['crawler']['domain']);
$crawler->setFollowMode(2);
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
$crawler->enableCookieHandling(true);

if ($_SESSION['crawler']['respect_robots_txt'] == true) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}

$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');

// no data on page yet
if ($_SESSION['crawler']['auth'] == true) {
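// The url_exists() helper called above isn't part of PHPCrawl and isn't shown
// in this excerpt. A minimal sketch of what such a method on the crawler
// class might look like, using get_headers() (name and behaviour assumed):
private function url_exists($url) {
    $headers = @get_headers($url);
    // Treat any 2xx/3xx status line as "exists"
    return $headers !== false && preg_match('#HTTP/\S+\s+[23]\d\d#', $headers[0]) === 1;
}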
$sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } //echo "ssadasdsa"; //flush (); } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://ngoisao.net/tin-tuc/thu-gian/an-choi/dia-chi-cuoi-tuan-hai-khong-gian-de-tron-o-sapa-3346068.html"); $crawler->setCrawlingDepthLimit(0); $crawler->enableCookieHandling(true); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb;
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$url = "https://six.akademik.itb.ac.id/publik/displayprodikelas.php?semester=1&tahun=2014&th_kur=2013";
$crawler->setURL($url);

// Only receive content of documents with content-type "text/html"
//$crawler->addReceiveContentType("#text/html#");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addContentTypeReceiveRule("#text/plain#");

// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|css|js)$# i");

// (Leftover from the php.net example this was based on:)
// Every URL within the mysql-documentation looks like
// "http://php.net/manual/en/function.mysql-affected-rows.php"
// or "http://php.net/manual/en/mysql.setup.php", they all contain
// "http://php.net/manual/en/" followed by "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
//$crawler->addURLFollowRule("#^http://php.net/manual/en/.*mysql[^a-z]# i");
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //    return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}

// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("https://www.youtube.com");
$crawler->setCrawlingDepthLimit(0);

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
// $crawler->setTrafficLimit(1000000 * 1024);

// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
{
    function handleDocumentInfo($DocInfo) {
        if ($DocInfo->http_status_code == 200) {
            $info = pathinfo($DocInfo->url, PATHINFO_EXTENSION);
            // Media-file extensions we want to report
            $media = array('mp4', 'mp3', 'dat', 'wav', 'ogg', 'wma', 'avi', 'mkv', 'rmvb', 'srt');
            if (in_array($info, $media)) {
                echo '<tr><td>';
                echo '<span class="badge badge-success">Link</span> <a href="' . $DocInfo->url . '">' . urldecode($DocInfo->url) . '</a><br />';
                echo '<span class="badge">Referer-link</span> <a href="' . $DocInfo->referer_url . '">' . urldecode($DocInfo->referer_url) . '</a>';
                echo '</td></tr>';
                flush();
            }
        }
    }
}

$crawler = new MyCrawler();
$crawler->setURL($url);
$crawler->addContentTypeReceiveRule("#text/html#");
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 5 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(5000 * 1024);
$crawler->go();
$report = $crawler->getProcessReport();
}
?>
</tbody>
</table>
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://rentpad.com.ph/long-term-rentals/cebu/apartment");

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'"; if ($conn->query($sql) === TRUE) { echo "Record updated successfully: id = " . $id . "<br>"; } else { echo "Error updating record: id = " . $id . " " . $conn->error . "<br>"; } $conn->close(); } //echo "ssadasdsa"; //flush (); } function handleHeaderInfo(PHPCrawlerResponseHeader $header) { } } $crawler = new MyCrawler(); // URL to crawl $crawler->setURL("http://vnexpress.net"); $crawler->setCrawlingDepthLimit(1); $crawler->go(); $report = $crawler->getProcessReport(); //echo $report->; if (PHP_SAPI == "img") { $lb = "\n"; } else { $lb = "<br />"; } echo "Summary:" . $lb; echo "Links followed: " . $report->links_followed . $lb; echo "Documents received: " . $report->files_received . $lb; echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
                    $p = $this->file_count . ".txt";
                    echo "Saving as " . $p . $lb;
                    file_put_contents('crawled/' . $p, $DocInfo->content);
                }
            }
        } else {
            echo "Content not received" . $lb;
        }

        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$url = "https://six.akademik.itb.ac.id/publik/displayprodikelas.php?semester=1&tahun=2014&th_kur=2013";
$crawler->setURL($url);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
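// The file_put_contents() call above silently fails (beyond a PHP warning)
// if the crawled/ directory doesn't exist. A small guard before starting the
// crawl could create it first; a sketch:
if (!file_exists('crawled')) {
    mkdir('crawled', 0755, true);
}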
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_share = '{$nShare}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
    }
}

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://www.24h.com.vn/am-thuc/cach-muoi-dua-cai-ngon-gion-vang-uom-c460a765028.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
//echo $report->;

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
    }
}

// the list of supported faculty codes
$codes = ["101", "102", "103", "105", "160", "104", "106", "112", "114", "115", "119", "161", "198",
    "107", "116", "162", "121", "122", "123", "125", "164", "120", "128", "129", "151", "163",
    "130", "133", "134", "144", "167", "195", "132", "135", "165", "180", "181", "182", "183",
    "131", "136", "137", "169", "150", "153", "155", "157", "158", "166", "196", "152", "154",
    "199", "168", "170", "172", "173", "174", "175", "179", "190", "192", "197"];

for ($year = 2015; $year <= 2015; ++$year) {
    for ($semester = 2; $semester <= 2; ++$semester) {
        // if ($year === 2015 && $semester === 2) {
        //     break;
        // }
        foreach ($codes as $code) {
            // Create a new directory specific to this faculty
            $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
            if (!file_exists($directoryName)) {
                mkdir($directoryName, 0755, true);
            }

            $crawler = new MyCrawler();
            $crawler->code = $code;
            $crawler->file_count = 0;
            $crawler->directoryName = $directoryName;

            // URL to crawl
            $curriculum_year = $year < 2013 ? 2008 : 2013;
            $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
            $crawler->setURL($url);

            // Only receive content of files with content-type "text/html"
            $crawler->addContentTypeReceiveRule("#text/html#");

            // Ignore links to pictures, don't even request pictures
            $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

            // Don't let it go back to the main page
            $res = $crawler->addURLFilterRule("#displayprodikelas.php# i");

            // That's enough, now here we go
            echo "Start crawling for year " . $year . " semester " . $semester;
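// For illustration (not in the original): with $code = "101", $semester = 2
// and $year = 2015, $curriculum_year resolves to 2013 and the constructed
// URL becomes:
// https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=101&semester=2&tahun=2015&th_kur=2013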
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //    return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}

// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://www.doisongphapluat.com/tin-tuc/tin-trong-nuoc/dai-hoi-cua-doan-ket-dan-chu-ky-cuong-doi-moi-a129965.html");
$crawler->setCrawlingDepthLimit(0);

// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
// $crawler->setTrafficLimit(1000000 * 1024);

// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
class MyCrawler extends PHPCrawler {
    function handleDocumentInfo($DocInfo) {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        } else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP-status-code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
$crawler->setPageLimit(50); // Set the page-limit to 50 for testing

// Important for resumable scripts/processes!
$crawler->enableResumption();

// At the first start of the script, retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
} else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
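// The excerpt above stops inside the else-branch. A typical continuation
// (a sketch based on the stock PHPCrawl resumption example, not part of this
// excerpt) closes the branch and starts the crawl:
}

// That's it, start crawling
$crawler->go();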
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }

    function handleHeaderInfo(PHPCrawlerResponseHeader $header) {
    }
}

$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://dantri.com.vn/the-gioi/my-phan-no-vi-video-thuy-thu-quy-goi-truoc-binh-sy-iran-20160119082404452.htm");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();

$report = $crawler->getProcessReport();
//echo $report->;

if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }

            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! error line: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}

$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
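// Design note (not in the original excerpt): in MPMODE_CHILDS_EXECUTES_USERCODE
// the user callbacks run inside forked child processes, so per-process
// resources such as the PDO handle used above should be opened in every
// child. PHPCrawl provides the overridable initChildProcess() hook for this;
// a sketch, with the DSN/credentials and the $dbh property being assumptions:
class MyCrawlerWithChildInit extends MyCrawler {
    protected $dbh;
    public function initChildProcess() {
        // Runs once in each child process right after it is forked
        $this->dbh = new PDO("pgsql:host=localhost;dbname=shop", "user", "pass");
    }
}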
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png|css|js)([?].*)?$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go