Example #1
/**
 *  crawl method
 *  Create the crawler class object and set the options for crawling
 * @param string $u URL to crawl
 */
function crawl($u)
{
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    /* Only receive HTML pages */
    $C->addURLFilterRule("#(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");
    /* We don't want to crawl non HTML pages */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true);
    /* Obey the site's robots.txt */
    $C->go();
}
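crawl() assumes a MyCrawler subclass is already defined elsewhere; it is not part of this fragment. A minimal sketch of such a class, assuming only the handleDocumentInfo() override from PHPCrawl's class reference:
// Minimal sketch of the MyCrawler class crawl() expects (an assumption;
// the original definition is not shown in this fragment).
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Print each requested URL and its HTTP status code.
        echo $DocInfo->url . " (" . $DocInfo->http_status_code . ")\n";
        flush();
    }
}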
Example #2
        */
        //    if ($DocInfo->http_status_code != "200") {
        //    if (!(stripos($DocInfo->url, 'http://www.thework.com') === FALSE)) {
        //echo $DocInfo->url . "," . $DocInfo->http_status_code  . "," . $DocInfo->referer_url . $lb;
        echo $DocInfo->http_status_code . "," . $DocInfo->referer_url . "," . $DocInfo->url . $lb;
        //  }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        //echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("dev.thework.com");
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);
//$crawler->setFollowMode(1);
//$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
if (PHP_SAPI == "cli") {
    $lb = "\n";
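The fragment breaks off inside the SAPI check; the matching tail, judging by the identical pattern in the other examples on this page:
} else {
    $lb = "<br />";
}
// ...presumably followed by $crawler->go() and the usual report output.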
Example #3
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://vietnamnet.vn/vn/chinh-tri/285748/nhan-su-duoc-rut-hay-khong-do-dai-hoi-quyet-dinh.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
Example #4
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}'   WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://blogtamsu.vn/");
$crawler->setCrawlingDepthLimit(1);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
Example #5
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        } else {
            $lb = "<br />";
        }
        // Print the URL and the HTTP-status-Code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js|ico|xml)([?].*)?\$# i");
$crawler->setPageLimit(3);
// Set the page-limit to 3 for testing
// Important for resumable scripts/processes!
$crawler->enableResumption();
// At the first start of the script, retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
} else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
Example #6
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("www.php.net");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
Example #7
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");
// Only receive content of documents with content-type "text/html"
$crawler->addReceiveContentType("#text/html#");
// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");
// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php", they all contain
// "http://www.php.net/manual/en/" followed by  "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");
// That's it, start crawling using 5 processes
$crawler->goMultiProcessed(5);
// At the end, after the process is finished, we print a short
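A note on goMultiProcessed(): it only works from the command line and, per the PHPCrawl documentation, needs the pcntl and sysvsem extensions. A simple guard (an addition, not part of the original example):
// Fail early if multi-process mode is unavailable in this environment.
if (PHP_SAPI != "cli" || !extension_loaded("pcntl") || !extension_loaded("sysvsem")) {
    die("goMultiProcessed() requires the PHP CLI with pcntl and sysvsem.\n");
}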
Example #8
                        // Insert the data in the database
                        $conn->query($sql);
                    }
                    // end if
                } else {
                    echo 'skip these pages <br>';
                }
            }
            $html->clear();
            unset($html);
        }
        flush();
    }
}
// Bring the crawler out
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://harcourts.co.nz/");
// Crawl only URLs with the word "property" in them
$crawler->addURLFollowRule("#property# i");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 10 MB (in bytes)
$crawler->setTrafficLimit(10 * 1024 * 1024);
// Start crawler
$crawler->go();
// At the end, after the process is finished print report
Example #9
            }
            if (!$this->url_exists($link)) {
                unset($this->links[$ldx]);
                continue;
            }
            $this->links[$ldx] = $link;
        }
        // Final re-order
        $this->links = array_values($this->links);
        return $this->links;
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler($_SESSION['crawler']['domain']);
$crawler->setFollowMode(2);
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->enableCookieHandling(true);
if ($_SESSION['crawler']['respect_robots_txt'] == true) {
    $crawler->obeyRobotsTxt(true, $_SESSION['crawler']['domain'] . '/robots.txt');
    $crawler->obeyNoFollowTags(true);
}
$crawler->enableAggressiveLinkSearch(false);
$crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS);
$crawler->addLinkSearchContentType("#text/html# i");
$crawler->setLinkExtractionTags(array('href'));
$crawler->setUserAgentString('Crawl_Scrape_Solr_Index/1.0');
// no data on page yet
if ($_SESSION['crawler']['auth'] == true) {
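The fragment ends mid-condition. A hypothetical continuation, assuming the session carries credentials (the auth_user/auth_password keys are assumptions) and using PHPCrawl's addBasicAuthentication() method:
    // Hypothetical continuation: send HTTP basic auth for all URLs
    // on the crawled domain (session keys are assumptions).
    $crawler->addBasicAuthentication(
        "#^" . preg_quote($_SESSION['crawler']['domain'], "#") . "#",
        $_SESSION['crawler']['auth_user'],
        $_SESSION['crawler']['auth_password']
    );
}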
Example #10
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
        //echo "ssadasdsa";
        //flush ();
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://ngoisao.net/tin-tuc/thu-gian/an-choi/dia-chi-cuoi-tuan-hai-khong-gian-de-tron-o-sapa-3346068.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
Example #11
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl (the entry-page of the mysql-documentation on php.net)
$url = "https://six.akademik.itb.ac.id/publik/displayprodikelas.php?semester=1&tahun=2014&th_kur=2013";
$crawler->setURL($url);
// Only receive content of documents with content-type "text/html"
//$crawler->addReceiveContentType("#text/html#");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addContentTypeReceiveRule("#text/plain#");
// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");
// Every URL within the mysql-documentation looks like
// "http://php.net/manual/en/function.mysql-affected-rows.php"
// or "http://php.net/manual/en/mysql.setup.php", they all contain
// "http://php.net/manual/en/" followed by  "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
//$crawler->addURLFollowRule("#^http://php.net/manual/en/.*mysql[^a-z]# i");
Example #12
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //  return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}
// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("https://www.youtube.com");
$crawler->setCrawlingDepthLimit(0);
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//     $crawler->setTrafficLimit(1000000 * 1024);
//
// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
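The commented-out content-type check in handleHeaderInfo() above works when enabled: per the PHPCrawl class reference, returning a negative value from handleHeaderInfo() makes the crawler skip receiving that document's content. An enabled version of the override (a sketch to drop into the class):
function handleHeaderInfo(PHPCrawlerResponseHeader $header)
{
    // Don't receive anything that isn't text/html.
    if ($header->content_type != "text/html") {
        return -1;
    }
}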
Example #13
    {
        function handleDocumentInfo($DocInfo)
        {
            if ($DocInfo->http_status_code == 200) {
                $info = pathinfo($DocInfo->url, PATHINFO_EXTENSION);
                if (in_array($info, array('mp4', 'mp3', 'dat', 'wav', 'ogg', 'wma', 'avi', 'mkv', 'rmvb', 'srt'))) {
                    echo '<tr><td>';
                    echo '<span class="badge badge-success">Link</span> <a href="' . $DocInfo->url . '">' . urldecode($DocInfo->url) . '</a><br />';
                    echo '<span class="badge">Referer-link</span> <a href="' . $DocInfo->referer_url . '">' . urldecode($DocInfo->referer_url) . '</a>';
                    echo '</td></tr>';
                    flush();
                }
            }
        }
    }
    $crawler = new MyCrawler();
    $crawler->setURL($url);
    $crawler->addContentTypeReceiveRule("#text/html#");
    //$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
    $crawler->enableCookieHandling(true);
    // Set the traffic-limit to 5 MB (in bytes;
    // for testing we don't want to "suck" the whole site)
    $crawler->setTrafficLimit(5000 * 1024);
    $crawler->go();
    $report = $crawler->getProcessReport();
}
?>

  </tbody>
</table>
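One caveat about the extension check in this example: pathinfo() on a full URL keeps any query string in the "extension" ("video.mp4?t=10" yields "mp4?t=10"), so media links with parameters slip through. A sketch that parses the URL path first (an addition, not part of the original):
$path = parse_url($DocInfo->url, PHP_URL_PATH);
$info = strtolower(pathinfo($path, PATHINFO_EXTENSION));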
Example #14
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://rentpad.com.ph/long-term-rentals/cebu/apartment");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
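The report itself is cut off here; in the style of the other examples on this page, it would look like:
$report = $crawler->getProcessReport();
// Detect the linebreak for output ("\n" in CLI mode, otherwise "<br />").
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;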
Example #15
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_comment = '{$nCommand}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
        //echo "ssadasdsa";
        //flush ();
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://vnexpress.net");
$crawler->setCrawlingDepthLimit(1);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
Example #16
                    $p = $this->file_count . ".txt";
                    echo "Saving as " . $p . $lb;
                    file_put_contents('crawled/' . $p, $DocInfo->content);
                }
            }
        } else {
            echo "Content not received" . $lb;
        }
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$url = "https://six.akademik.itb.ac.id/publik/displayprodikelas.php?semester=1&tahun=2014&th_kur=2013";
$crawler->setURL($url);
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
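Note that the file_put_contents() call earlier in this example writes into a crawled/ directory and fails with a warning if that directory is missing; creating it up front, as Example #18 does, avoids that:
// Make sure the output directory exists before crawling (a sketch).
if (!file_exists("crawled")) {
    mkdir("crawled", 0755, true);
}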
Example #17
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}' , num_share = '{$nShare}' WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.24h.com.vn/am-thuc/cach-muoi-dua-cai-ngon-gion-vang-uom-c460a765028.html");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
Example #18
    }
}
// the list of supported faculty codes
$codes = ["101", "102", "103", "105", "160", "104", "106", "112", "114", "115", "119", "161", "198", "107", "116", "162", "121", "122", "123", "125", "164", "120", "128", "129", "151", "163", "130", "133", "134", "144", "167", "195", "132", "135", "165", "180", "181", "182", "183", "131", "136", "137", "169", "150", "153", "155", "157", "158", "166", "196", "152", "154", "199", "168", "170", "172", "173", "174", "175", "179", "190", "192", "197"];
for ($year = 2015; $year <= 2015; ++$year) {
    for ($semester = 2; $semester <= 2; ++$semester) {
        // if ($year === 2015 && $semester === 2) {
        //     break;
        // }
        foreach ($codes as $code) {
            // Create new directory special for this faculty
            $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
            if (!file_exists($directoryName)) {
                mkdir($directoryName, 0755, true);
            }
            $crawler = new MyCrawler();
            $crawler->code = $code;
            $crawler->file_count = 0;
            $crawler->directoryName = $directoryName;
            // URL to crawl
            $curriculum_year = $year < 2013 ? 2008 : 2013;
            $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
            $crawler->setURL($url);
            // Only receive content of files with content-type "text/html"
            $crawler->addContentTypeReceiveRule("#text/html#");
            // Ignore links to pictures, don't even request pictures
            $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
            // Don't let it go back to the main page
            $res = $crawler->addURLFilterRule("#displayprodikelas.php# i");
            // That's enough, now here we go
            echo "Start crawling for year " . $year . " semester " . $semester;
Example #19
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
        // If the content-type of the document isn't "text/html" -> don't receive it.
        //if ($header->content_type != "text/html")
        //{
        //  return -1;
        //}
        //echo "$header->header_raw.<br>";
    }
}
// Extend the class and override the handleDocumentInfo()-method
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://www.doisongphapluat.com/tin-tuc/tin-trong-nuoc/dai-hoi-cua-doan-ket-dan-chu-ky-cuong-doi-moi-a129965.html");
$crawler->setCrawlingDepthLimit(0);
// Only receive content of files with content-type "text/html"
//$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
//$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
//     $crawler->setTrafficLimit(1000000 * 1024);
//
// echo "URL: ".$PageInfo->url."<br />";
//$PageInfo = new PHPCrawlerDocumentInfo();
Example #20
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        } else {
            $lb = "<br />";
        }
        // Print the URL and the HTTP-status-Code
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
$crawler->setPageLimit(50);
// Set the page-limit to 50 for testing
// Important for resumable scripts/processes!
$crawler->enableResumption();
// At the first start of the script, retrieve the crawler-ID and store it
// (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp")) {
    $crawler_ID = $crawler->getCrawlerId();
    file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
} else {
    $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
    $crawler->resume($crawler_ID);
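The else branch is truncated; a hypothetical completion that closes it, starts the (possibly resumed) crawl, and clears the stored ID afterwards:
}
$crawler->go();
// Remove the stored ID so the next run starts a fresh crawl
// (an assumption about the intended workflow).
unlink("/tmp/mycrawlerid_for_php.net.tmp");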
Example #21
                unlink($HSource);
            }
            $sql = "UPDATE news SET num_like = '{$nLike}'  WHERE id='{$id}'";
            if ($conn->query($sql) === TRUE) {
                echo "Record updated successfully: id = " . $id . "<br>";
            } else {
                echo "Error updating record: id = " . $id . " " . $conn->error . "<br>";
            }
            $conn->close();
        }
    }
    function handleHeaderInfo(PHPCrawlerResponseHeader $header)
    {
    }
}
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("http://dantri.com.vn/the-gioi/my-phan-no-vi-video-thuy-thu-quy-goi-truoc-binh-sy-iran-20160119082404452.htm");
$crawler->setCrawlingDepthLimit(0);
$crawler->enableCookieHandling(true);
$crawler->go();
$report = $crawler->getProcessReport();
//echo $report->;
if (PHP_SAPI == "img") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
Example #22
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            $sth = $dbh->prepare("INSERT INTO Products (IdMerchant,Name,Description,Price,URL,Image,QueryDocument) VALUES (:IdMerchant,:Name::text,:Description::text,:Price,:URL,:Image,to_tsvector(:Name::text) || to_tsvector(:Description::text))");
            $sth->bindValue(":IdMerchant", self::IdMerchant);
            $sth->bindValue(":Name", $title);
            $sth->bindValue(":Description", $description);
            $sth->bindValue(":Price", $newprice);
            $sth->bindValue(":URL", $url);
            $sth->bindValue(":Image", $image);
            $sth->execute();
            if ($sth->errorCode() != 0) {
                die("! erro linha: " . __LINE__ . "\n" . $sth->errorInfo()[2]);
            }
            echo $URL . " added\n\n";
        }
        flush();
    }
}
$crawler = new MyCrawler();
$crawler->setURL("http://uae.souq.com/ae-en/");
$crawler->addReceiveContentType("#text/html#");
$crawler->setCrawlingDepthLimit(4);
$crawler->setWorkingDirectory("./tmp/");
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
while (true) {
    $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
    $report = $crawler->getProcessReport();
    sleep(15 * 60);
}
Example #23
        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }
        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source), we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
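The example is cut off just before the crawl starts; following the pattern of the examples above, the next line would be:
$crawler->go();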