Example #1
/**
 * crawl method
 * Creates the crawler object and sets the options for crawling.
 * @param string $u URL to crawl
 */
function crawl($u)
{
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    /* Only receive HTML pages */
    $C->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");
    /* We don't want to crawl non-HTML resources */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true);
    /* Obey robots.txt */
    $C->go();
}
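All of these snippets assume a MyCrawler class that extends PHPCrawl's PHPCrawler and overrides its handleDocumentInfo() callback; since each example below is truncated at the top, here is a minimal sketch of such a subclass (illustrative only, not taken from any of the originals):

class MyCrawler extends PHPCrawler
{
    // PHPCrawl calls this for every document it receives during a crawl
    function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
    {
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}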
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
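The UPDATE above interpolates $nLike, $nCommand and $id directly into the SQL string, which is open to SQL injection if any of those values derive from crawled content. A safer sketch of the same statement using a mysqli prepared statement (assuming $conn is the mysqli connection these snippets use and that all three values are integers):

$stmt = $conn->prepare("UPDATE news SET num_like = ?, num_comment = ? WHERE id = ?");
$stmt->bind_param("iii", $nLike, $nCommand, $id); // "iii" assumes integer values
if ($stmt->execute()) {
    echo "Record updated successfully: id = " . $id . "<br>";
} else {
    echo "Error updating record: id = " . $id . " " . $stmt->error . "<br>";
}
$stmt->close();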
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://vietnamnet.vn/vn/chinh-tri/285748/nhan-su-duoc-rut-hay-khong-do-dai-hoi-quyet-dinh.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
            $sql = "UPDATE news SET num_like = '{$nLike}'   WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://blogtamsu.vn/");
$crawler->setCrawlingDepthLimit(1);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
Example #5
 //     break;
 // }
 foreach ($codes as $code) {
     // Create new directory special for this faculty
     $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
     if (!file_exists($directoryName)) {
         mkdir($directoryName, 0755, true);
     }
     $crawler = new MyCrawler();
     $crawler->code = $code;
     $crawler->file_count = 0;
     $crawler->directoryName = $directoryName;
     // URL to crawl
     $curriculum_year = $year < 2013 ? 2008 : 2013;
     $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
     $crawler->setURL($url);
     // Only receive content of files with content-type "text/html"
     $crawler->addContentTypeReceiveRule("#text/html#");
     // Ignore links to pictures, don't even request pictures
     $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
     // Don't let it go back to the main page
     $res = $crawler->addURLFilterRule("#displayprodikelas.php# i");
     // That's enough, now here we go
     echo "Start crawling for year " . $year . " semester " . $semester;
     $crawler->go();
     // At the end, after the process is finished, we print a short
     // report (see method getProcessReport() for more information)
     $report = $crawler->getProcessReport();
     echo "Summary for " . $code . ":" . $lb;
     echo "Links followed: " . $report->links_followed . $lb;
     echo "Documents received: " . $report->files_received . $lb;
Example #6
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("www.php.net");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
Example #7
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            // if we got here without dying, then we're good to go
            echo $url . " added\n\n";
        }
        // just flush buffer so we can keep up the progress
        flush();
    }
}
//---------------------------------------------------------------------------------------------
// Now, to the implementation. First, create the object.
$crawler = new MyCrawler();
// Now, set the URL. Try to be specific about the target, because sometimes the main page does some weird redirections.
$crawler->setURL("http://markavip.com/ae/");
// Let's filter to HTML only and limit the depth to 4 (enough for our example).
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
// We need a temp dir. Could be /tmp, but I like to keep things together.
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
// This is used to resume old crawlings. Since we're doing this in a loop, I don't recommend it.
/*
$crawler->enableResumption();
if (!file_exists("./tmp/markavip_id.tmp"))
{
  $crawler_ID = $crawler->getCrawlerId();
  file_put_contents("./tmp/markavip_id.tmp", $crawler_ID);
}
else
{
  $crawler_ID = file_get_contents("./tmp/markavip_id.tmp");
  $crawler->resume($crawler_ID);
}
*/
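Example #8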
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
        //echo "ssadasdsa";
        //flush ();
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://vnexpress.net");
$crawler->setCrawlingDepthLimit(1);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
Example #9
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("https://en-sa.namshi.com/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
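Note that goMultiProcessed() only works from the PHP CLI and, per PHPCrawl's documentation, requires the pcntl, posix and sysvsem extensions (plus PDO with the SQLite driver for the URL cache). A guard you could place before the loop above (a sketch, not part of the original example):

// Verify the environment before attempting multi-process crawling
if (PHP_SAPI != "cli" || !extension_loaded("pcntl") || !extension_loaded("posix") || !extension_loaded("sysvsem")) {
    die("Multi-process mode needs the CLI SAPI and the pcntl, posix and sysvsem extensions\n");
}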
            echo "Document Content: " . $DocInfo->content . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.ttm-iitd.in/");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
$crawler->setPageLimit(1);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
Example #11
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
        //echo "ssadasdsa";
        //flush ();
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://ngoisao.net/tin-tuc/thu-gian/an-choi/dia-chi-cuoi-tuan-hai-khong-gian-de-tron-o-sapa-3346068.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
Example #12
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //  return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}
// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("https://www.youtube.com");
$crawler->setCrawlingDepthLimit(0);
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//     $crawler->setTrafficLimit(1000000 * 1024);
//
// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
//$crawler->handleDocumentInfo($PageInfo);
//$crawler->handleDocumentInfo($PageInfo);
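Note on the commented-out lines above: they construct a PHPCrawlerDocumentInfo by hand and pass it to handleDocumentInfo(), which is never necessary; PHPCrawl invokes handleDocumentInfo() itself for every received document once the crawl is started with go().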
Example #13
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://rentpad.com.ph/long-term-rentals/cebu/apartment");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
Example #14
        //    if (!(stripos($DocInfo->url, 'http://www.thework.com') === FALSE)) {
        //echo $DocInfo->url . "," . $DocInfo->http_status_code  . "," . $DocInfo->referer_url . $lb;
        echo $DocInfo->http_status_code . "," . $DocInfo->referer_url . "," . $DocInfo->url . $lb;
        //  }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        //echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("dev.thework.com");
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);
//$crawler->setFollowMode(1);
//$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
Example #15
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_share = '{$nShare}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.24h.com.vn/am-thuc/cach-muoi-dua-cai-ngon-gion-vang-uom-c460a765028.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
Example #16
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,OldPrice,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:OldPrice,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":OldPrice", $oldprice);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("http://wysada.com/en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
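Example #17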
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //  return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}
// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.doisongphapluat.com/tin-tuc/tin-trong-nuoc/dai-hoi-cua-doan-ket-dan-chu-ky-cuong-doi-moi-a129965.html");
$crawler->setCrawlingDepthLimit(0);
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//     $crawler->setTrafficLimit(1000000 * 1024);
//
// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
//$crawler->handleDocumentInfo($PageInfo);
//$crawler->handleDocumentInfo($PageInfo);
Example #18
                    }
                    // end if
                } else {
                    echo 'skip these pages <br>';
                }
            }
            $html->clear();
            unset($html);
        }
        flush();
    }
}
// Bring the crawler out
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://harcourts.co.nz/");
// Crawl only URLs with the word property in them
$crawler->addURLFollowRule("#property# i");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 10 MB (in bytes)
$crawler->setTrafficLimit(10 * 1024 * 1024);
// Start crawler
$crawler->go();
// At the end, after the process is finished print report
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
Example #19
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("www.sysfoghost.in");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");
// Only receive content of documents with content-type "text/html"
$crawler->addReceiveContentType("#text/html#");
// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");
// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php", they all contain
// "http://www.php.net/manual/en/" followed by  "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");
// That's it, start crawling using 5 processes
$crawler->goMultiProcessed(5);
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
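One caveat about the multi-process call above: PHPCrawl's documentation expects the URL cache to be switched to SQLite for goMultiProcessed(), which this snippet never does (the other multi-process examples in this collection do). A sketch of the missing setup, reusing the working-directory convention from those examples:

$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
$crawler->goMultiProcessed(5);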
Example #21
            $sql = "UPDATE news SET num_like = '{$nLike}'  WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://dantri.com.vn/the-gioi/my-phan-no-vi-video-thuy-thu-quy-goi-truoc-binh-sy-iran-20160119082404452.htm");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
Example #22
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $url . " added\n\n";
        }
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
Example #23
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.divxturka.net/");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {