<?php
/**
 * Demo script: run two fetch jobs and dump the hyperlinks found on each page.
 */

require_once 'autoload.php';

/**
 * Crawler whose page callback prints the links extracted from the page.
 */
class mycrawler extends Phpfetcher_Crawler_Default
{
    /**
     * Page callback: print whatever getHyperLinks() returns for this page.
     *
     * @param mixed $page Page object supplied by the crawler; must expose getHyperLinks().
     * @return void
     */
    public function handlePage($page)
    {
        $links = $page->getHyperLinks();
        print_r($links);
    }
}

// Job for the blog: starting URL plus the link patterns configured for it.
$blogJob = array(
    'start_page' => 'http://blog.reetsee.com',
    'link_rules' => array(
        '/blog\\.reetsee\\.com/',
        '/wordpress/',
    ),
);

// Job for the news site: one article-URL pattern and an explicit max_depth setting.
$newsJob = array(
    'start_page' => 'http://news.qq.com',
    'link_rules' => array(
        '/(.*)\\/a\\/(\\d{8})\\/(\\d+)\\.htm/',
    ),
    'max_depth'  => 4,
);

// Jobs are keyed by name, in the same order as they were originally declared.
$fetchJobs = array(
    'blog.reetsee' => $blogJob,
    'qq'           => $newsJob,
);

$crawler = new mycrawler();
$crawler->setFetchJobs($fetchJobs)->run();
<?php
/**
 * Demo script: fetch a single page and print the text of its <title> element(s).
 */

// The next two statements let this file run directly after the project is
// downloaded, by adding the parent directory to the include path.
$demo_include_path = dirname(__FILE__) . '/../';
set_include_path(get_include_path() . PATH_SEPARATOR . $demo_include_path);

require_once 'phpfetcher.php';

/**
 * Crawler whose page callback prints the title of the current page.
 */
class mycrawler extends Phpfetcher_Crawler_Default
{
    /**
     * Page callback: print the plaintext of every node matched by '//title',
     * one per line.
     *
     * @param mixed $page Page object supplied by the crawler; must expose sel().
     * @return void
     */
    public function handlePage($page)
    {
        // Print the title of the current page.
        $titles = $page->sel('//title');

        // count() on a non-countable value (e.g. null) warns on PHP 7.2+ and
        // throws a TypeError on PHP 8, so bail out when nothing usable came back.
        if (!is_array($titles) && !($titles instanceof Countable)) {
            return;
        }

        // Hoist the count so it is evaluated once rather than on every iteration.
        $total = count($titles);
        for ($i = 0; $i < $total; ++$i) {
            echo $titles[$i]->plaintext;
            echo "\n";
        }
    }
}

$crawler = new mycrawler();

// Single job: one start page, no link rules, max_depth set to 1.
$arrJobs = array(
    'qqnews' => array(
        'start_page' => 'http://news.qq.com/a/20140927/026557.htm',
        'link_rules' => array(),
        'max_depth'  => 1,
    ),
);

// The two statements below have the same effect as the fluent one-liner:
//     $crawler->setFetchJobs($arrJobs)->run();
$crawler->setFetchJobs($arrJobs);
$crawler->run();