示例#1
0
<?php

require_once 'autoload.php';
/**
 * Example crawler whose page handler dumps the hyperlinks of a page.
 */
class mycrawler extends Phpfetcher_Crawler_Default
{
    /**
     * Print the hyperlinks reported by the given page using print_r().
     *
     * NOTE(review): presumably invoked by the parent crawler for each page it
     * fetches -- confirm against Phpfetcher_Crawler_Default.
     *
     * @param object $page Page object; must provide getHyperLinks().
     * @return void
     */
    public function handlePage($page)
    {
        $links = $page->getHyperLinks();
        print_r($links);
    }
}
// Instantiate the example crawler.
$crawler = new mycrawler();

// Fetch jobs keyed by job name; each job carries a 'start_page' URL and a
// list of 'link_rules'.
// NOTE(review): 'link_rules' look like PCRE patterns applied to discovered
// links and 'max_depth' like a crawl-depth limit -- confirm against the
// Phpfetcher documentation.
$arrFetchJobs = array();
$arrFetchJobs['blog.reetsee'] = array(
    'start_page' => 'http://blog.reetsee.com',
    'link_rules' => array(
        '/blog\\.reetsee\\.com/',
        '/wordpress/',
    ),
);
$arrFetchJobs['qq'] = array(
    'start_page' => 'http://news.qq.com',
    'link_rules' => array(
        '/(.*)\\/a\\/(\\d{8})\\/(\\d+)\\.htm/',
    ),
    'max_depth' => 4,
);

// Register the jobs, then call run() on the object setFetchJobs() returns.
$crawler->setFetchJobs($arrFetchJobs)->run();

// Leftover disabled call; note that $page is not defined at this scope.
//$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm');
示例#2
0
<?php

// The next two lines let this file run directly once the project has been downloaded
$demo_include_path = dirname(__FILE__) . '/../';
set_include_path(get_include_path() . PATH_SEPARATOR . $demo_include_path);
require_once 'phpfetcher.php';
/**
 * Example crawler whose page handler prints the text of <title> nodes.
 */
class mycrawler extends Phpfetcher_Crawler_Default
{
    /**
     * Echo the plaintext of every node matched by the '//title' selector,
     * one per line.
     *
     * NOTE(review): presumably invoked by the parent crawler for each page it
     * fetches -- confirm against Phpfetcher_Crawler_Default.
     *
     * @param object $page Page object; must provide sel().
     * @return void
     */
    public function handlePage($page)
    {
        // Select the title node(s) of the current page.
        $titles = $page->sel('//title');

        // Walk the result by index, printing each title followed by a newline.
        $idx = 0;
        while ($idx < count($titles)) {
            echo $titles[$idx]->plaintext, "\n";
            ++$idx;
        }
    }
}
// Instantiate the example crawler.
$crawler = new mycrawler();

// A single fetch job named 'qqnews' with an empty 'link_rules' list.
// NOTE(review): 'max_depth' => 1 presumably limits the crawl to the start
// page -- confirm against the Phpfetcher documentation.
$arrJobs = array();
$arrJobs['qqnews'] = array(
    'start_page' => 'http://news.qq.com/a/20140927/026557.htm',
    'link_rules' => array(),
    'max_depth' => 1,
);

// The two statements below have the same effect as the chained form:
//     $crawler->setFetchJobs($arrJobs)->run();
$crawler->setFetchJobs($arrJobs);
$crawler->run();