Beispiel #1
0
 /**
  * Run a selector query against the wrapped DOM object.
  *
  * The call is forwarded to the `find` method of $this->_dom when that
  * method is available.
  *
  * @param
  *      mixed $pattern : passed unchanged as the first argument of `find`
  *      mixed $idx     : passed unchanged as the second argument of `find`
  *      mixed $node    : not used by this implementation; kept so the
  *                       signature stays compatible with existing callers
  * @return
  *      mixed : whatever $this->_dom->find() returns
  *      FALSE : if $this->_dom is not an object or has no `find` method
  */
 public function sel($pattern = '', $idx = NULL, $node = NULL)
 {
     $strMethodName = 'find';
     // Check is_object() first: on PHP 8 method_exists() throws a TypeError
     // when its first argument is neither an object nor a string (e.g. a
     // NULL $this->_dom), which would bypass the warning/FALSE path below.
     if (is_object($this->_dom) && method_exists($this->_dom, $strMethodName)) {
         return $this->_dom->find($pattern, $idx);
     }

     Phpfetcher_Log::warning("method {$strMethodName} not exists");
     return FALSE;
 }
Beispiel #2
0
 /**
  * Crawl every registered fetch job level by level and hand each fetched
  * page to handlePage().
  *
  * Each job starts at $job_rules['start_page']. Two queues are kept in
  * $arrJobs: one is consumed while the other collects the links for the next
  * level; their roles are exchanged after every level. A job ends when the
  * current queue is empty or when 'max_depth' / 'max_pages' is reached
  * (-1 disables the respective limit). URLs found in
  * $this->_arrAdditionalUrls are appended to the next level after each pass
  * and the list is then cleared.
  *
  * Links that do not start with a scheme known to $this->_objSchemeTrie are
  * treated as site-internal and are prefixed with scheme://host[:port] of the
  * page they were found on. Note that path-relative links ("foo/bar") are
  * anchored at the host root, not at the current page's directory.
  *
  * @author xuruiqi
  * @param
  *      array $arrInput : run settings
  *          string 'page_class_name' : Page class to instantiate; it must be
  *                                     a subclass of Phpfetcher_Page_Abstract.
  *                                     self::DEFAULT_PAGE_CLASS is used when
  *                                     this entry is empty.
  *          [array 'page_conf'] : optional argument for the Page's setConf();
  *                                a 'url' entry is removed before it is
  *                                passed on
  * @return
  *      obj $this
  * @desc
  */
 public function &run($arrInput = array())
 {
     if (empty($this->_arrFetchJobs)) {
         Phpfetcher_Log::warning("No fetch jobs.");
         return $this;
     }
     // Build the Page object
     $objPage = NULL;
     $strPageClassName = self::DEFAULT_PAGE_CLASS;
     if (!empty($arrInput['page_class_name'])) {
         $strPageClassName = strval($arrInput['page_class_name']);
     }
     try {
         if (!class_exists($strPageClassName, TRUE)) {
             throw new Exception("[{$strPageClassName}] class not exists!");
         }
         $objPage = new $strPageClassName();
         if (!$objPage instanceof Phpfetcher_Page_Abstract) {
             throw new Exception("[{$strPageClassName}] is not an instance of " . self::ABSTRACT_PAGE_CLASS);
         }
     } catch (Exception $e) {
         Phpfetcher_Log::fatal($e->getMessage());
         return $this;
     }
     // Initialise the Page object
     $arrPageConf = empty($arrInput['page_conf']) ? array() : $arrInput['page_conf'];
     $objPage->init();
     if (!empty($arrPageConf)) {
         // The URL is set per page inside the crawl loop, never from the conf
         if (isset($arrPageConf['url'])) {
             unset($arrPageConf['url']);
         }
         $objPage->setConf($arrPageConf);
     }
     // Walk through the registered jobs
     foreach ($this->_arrFetchJobs as $job_name => $job_rules) {
         if (!$this->_isJobValid($job_rules)) {
             Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]");
             continue;
         }
         $intDepth = 0;
         $intPageNum = 0;
         // $arrIndice[0] is the queue being consumed, $arrIndice[1] the one being filled
         $arrIndice = array(0, 1);
         $arrJobs = array(0 => array($job_rules['start_page']), 1 => array());
         // Start crawling
         while (!empty($arrJobs[$arrIndice[0]]) && ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth']) && ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {
             $intDepth += 1;
             $intPopIndex = $arrIndice[0];
             $intPushIndex = $arrIndice[1];
             $arrJobs[$intPushIndex] = array();
             foreach ($arrJobs[$intPopIndex] as $url) {
                 // Re-check the page budget: it can run out in the middle of a level
                 if (!($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {
                     break;
                 }
                 $objPage->setUrl($url);
                 $objPage->read();
                 // Collect all hyperlinks of the page
                 $arrLinks = $objPage->getHyperLinks();
                 // Split the current URL into its components so that
                 // site-internal links such as "/entry" can be completed
                 $strCurUrl = $objPage->getUrl();
                 $arrUrlComponents = parse_url($strCurUrl);
                 // parse_url() returns FALSE for malformed URLs and omits
                 // components that are absent, so only build a base when both
                 // scheme and host are really there.
                 $strBaseUrl = NULL;
                 if (isset($arrUrlComponents['scheme'], $arrUrlComponents['host'])) {
                     $strBaseUrl = $arrUrlComponents['scheme'] . "://" . $arrUrlComponents['host'];
                     if (isset($arrUrlComponents['port']) && strlen($arrUrlComponents['port']) != 0) {
                         $strBaseUrl .= ":{$arrUrlComponents['port']}";
                     }
                 }
                 $bolBaseWarned = false;
                 // Match the hyperlinks against the job's link rules
                 foreach ($job_rules['link_rules'] as $link_rule) {
                     foreach ($arrLinks as $link) {
                         if (preg_match($link_rule, $link) !== 1 || $this->getHash($link)) {
                             continue;
                         }
                         // Work out the URL that will actually be fetched
                         $real_link = $link;
                         // Look for a scheme separator in the first six
                         // characters only; isset() stops the scan at the end
                         // of links shorter than that.
                         $colon_pos = false;
                         for ($i = 0; $i <= 5 && isset($link[$i]); ++$i) {
                             if ($link[$i] == ':') {
                                 $colon_pos = $i;
                                 break;
                             }
                         }
                         if ($colon_pos === false || !$this->_objSchemeTrie->has(substr($link, 0, $colon_pos))) {
                             if ($strBaseUrl === NULL) {
                                 // Without scheme and host there is nothing to
                                 // complete the link with; report once per page.
                                 if (!$bolBaseWarned) {
                                     Phpfetcher_Log::warning("Cannot resolve site-internal links, no scheme/host in [{$strCurUrl}]");
                                     $bolBaseWarned = true;
                                 }
                                 continue;
                             }
                             if (isset($link[1]) && $link[0] == '/' && $link[1] == '/') {
                                 // Protocol-relative link ("//host/path"): it
                                 // already names its host, only the scheme is missing
                                 $real_link = $arrUrlComponents['scheme'] . ":" . $link;
                             } else {
                                 // Turn the site-internal address into a full URL
                                 $real_link = $strBaseUrl . (isset($link[0]) && $link[0] == '/' ? $link : "/{$link}");
                             }
                         }
                         $this->setHash($link, true);
                         // The same target may already have been queued under
                         // its absolute form; do not fetch it a second time.
                         if ($real_link !== $link && $this->getHash($real_link)) {
                             continue;
                         }
                         $this->setHash($real_link, true);
                         $arrJobs[$intPushIndex][] = $real_link;
                     }
                 }
                 // handlePage() is implemented by the user
                 $objPage->setExtraInfo(array('job_name' => $job_name));
                 $this->handlePage($objPage);
                 $intPageNum += 1;
             }
             // Queue the additional URLs for the next level and clear the list
             if (!empty($this->_arrAdditionalUrls)) {
                 $arrJobs[$intPushIndex] = array_merge($arrJobs[$intPushIndex], $this->_arrAdditionalUrls);
                 $this->_arrAdditionalUrls = array();
             }
             // Exchange the roles of the two queues for the next level
             self::_swap($arrIndice[0], $arrIndice[1]);
         }
     }
     return $this;
 }
Beispiel #3
0
 /**
  * Select content by forwarding the query to $this->_dom->sel().
  *
  * @author xuruiqi
  * @param
  *      mixed $strPath     : path expression, passed on unchanged
  *      mixed $intIndex    : optional index, passed on unchanged
  *      mixed $contextnode : accepted but not supported; a non-NULL value only
  *                           triggers a warning and the query runs as if it
  *                           had not been given
  *
  * @return
  *      mixed : whatever $this->_dom->sel() returns
  *      NULL  : if $this->_dom equals NULL
  * @desc thin wrapper around the underlying DOM object's sel()
  */
 public function sel($strPath, $intIndex = NULL, $contextnode = NULL)
 {
     // Nothing to query without a DOM object
     if (NULL === $this->_dom) {
         Phpfetcher_Log::warning('$this->_dom is NULL!');
         return NULL;
     }

     // A context node cannot be honoured; tell the caller, then carry on
     if (NULL !== $contextnode) {
         Phpfetcher_Log::warning('param contextnode is no use because of this function\'s inability');
     }

     return $this->_dom->sel($strPath, $intIndex);
 }
Beispiel #4
0
 /**
  * Crawl every registered fetch job level by level and pass each fetched
  * page to handlePage().
  *
  * Every job starts at $job_rules['start_page']. Links of a page that match
  * one of the job's 'link_rules' and are not yet marked via getHash() are
  * marked with setHash() and queued for the next level. A job ends when the
  * current queue is empty or when 'max_depth' / 'max_pages' is reached
  * (-1 disables the respective limit).
  *
  * @author xuruiqi
  * @param
  *      array $arrInput : run settings
  *          string 'page_class_name' : Page class to instantiate; it must be
  *                                     a subclass of Phpfetcher_Page_Abstract.
  *                                     self::DEFAULT_PAGE_CLASS is used when
  *                                     this entry is empty.
  *          [array 'page_conf'] : optional argument for the Page's setConf();
  *                                a 'url' entry is removed before it is
  *                                passed on
  * @return
  *      obj $this
  * @desc
  */
 public function &run($arrInput = array())
 {
     if (empty($this->_arrFetchJobs)) {
         Phpfetcher_Log::warning("No fetch jobs.");
         return $this;
     }

     // Decide which Page class to use
     $strPageClassName = empty($arrInput['page_class_name'])
         ? self::DEFAULT_PAGE_CLASS
         : strval($arrInput['page_class_name']);

     // Build the Page object; any failure is logged as fatal and ends the run
     $objPage = NULL;
     try {
         if (!class_exists($strPageClassName, TRUE)) {
             throw new Exception("[{$strPageClassName}] class not exists!");
         }
         $objPage = new $strPageClassName();
         if (!($objPage instanceof Phpfetcher_Page_Abstract)) {
             throw new Exception("[{$strPageClassName}] is not an instance of " . self::ABSTRACT_PAGE_CLASS);
         }
     } catch (Exception $objError) {
         Phpfetcher_Log::fatal($objError->getMessage());
         return $this;
     }

     // Initialise the Page object, applying the optional configuration
     $objPage->init();
     if (!empty($arrInput['page_conf'])) {
         $arrConf = $arrInput['page_conf'];
         // The URL is set per page inside the crawl loop, never from the conf
         if (isset($arrConf['url'])) {
             unset($arrConf['url']);
         }
         $objPage->setConf($arrConf);
     }

     // Walk through the registered jobs
     foreach ($this->_arrFetchJobs as $job_name => $job_rules) {
         if (!$this->_isJobValid($job_rules)) {
             Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]");
             continue;
         }

         $intLevel = 0;
         $intFetched = 0;
         // $arrIndice[0] is the queue being consumed, $arrIndice[1] the one being filled
         $arrIndice = array(0, 1);
         $arrQueues = array(0 => array($job_rules['start_page']), 1 => array());

         // Start crawling, one level per iteration
         while (true) {
             if (empty($arrQueues[$arrIndice[0]])) {
                 break;
             }
             if ($job_rules['max_depth'] !== -1 && !($intLevel < $job_rules['max_depth'])) {
                 break;
             }
             if ($job_rules['max_pages'] !== -1 && !($intFetched < $job_rules['max_pages'])) {
                 break;
             }

             ++$intLevel;
             list($intFrom, $intTo) = $arrIndice;
             $arrQueues[$intTo] = array();

             foreach ($arrQueues[$intFrom] as $url) {
                 // Re-check the page budget: it can run out in the middle of a level
                 if ($job_rules['max_pages'] !== -1 && !($intFetched < $job_rules['max_pages'])) {
                     break;
                 }

                 $objPage->setUrl($url);
                 $objPage->read();

                 // Collect all hyperlinks of the page and match them against the rules
                 $arrLinks = $objPage->getHyperLinks();
                 foreach ($job_rules['link_rules'] as $link_rule) {
                     foreach ($arrLinks as $link) {
                         if (preg_match($link_rule, $link) !== 1) {
                             continue;
                         }
                         if ($this->getHash($link)) {
                             continue;
                         }
                         $this->setHash($link, true);
                         $arrQueues[$intTo][] = $link;
                     }
                 }

                 // handlePage() is implemented by the user
                 $objPage->setExtraInfo(array('job_name' => $job_name));
                 $this->handlePage($objPage);
                 ++$intFetched;
             }

             // Exchange the roles of the two queues for the next level
             self::_swap($arrIndice[0], $arrIndice[1]);
         }
     }

     return $this;
 }