/**
 * Adds one occurrence of a string to the trie rooted at $this->_arrTrieRoot.
 *
 * Walks the trie one byte at a time, creating any missing child node as
 * array('children' => array(), 'count' => 0), then increments the 'count'
 * of the node reached after the last byte. For an empty string that node
 * is the root itself.
 *
 * @param
 *      mixed $str : value to insert; converted with strval() first
 * @return
 *      bool : true on success, false if an Exception was caught (and logged)
 */
public function insert($str)
{
    try {
        $strKey   = strval($str);
        $intTotal = strlen($strKey);

        // Walk by reference so newly created nodes land in the real trie.
        $arrNode =& $this->_arrTrieRoot;

        $intPos = 0;
        while ($intPos < $intTotal) {
            $chrStep = $strKey[$intPos];
            if (!isset($arrNode['children'][$chrStep])) {
                $arrNode['children'][$chrStep] = array(
                    'children' => array(),
                    'count'    => 0,
                );
            }
            $arrNode =& $arrNode['children'][$chrStep];
            ++$intPos;
        }

        $arrNode['count'] += 1;

        // Drop the reference so later writes to the local cannot touch the trie.
        unset($arrNode);
    } catch (Exception $e) {
        Phpfetcher_Log::fatal($e->getMessage());
        return false;
    }

    return true;
}
/**
 * Runs every registered fetch job as a level-by-level crawl.
 *
 * For each valid job the crawl starts at 'start_page', reads the page,
 * queues every hyperlink matching one of the job's 'link_rules' for the
 * next level, and hands the page to handlePage(). Links that do not start
 * with a scheme known to $this->_objSchemeTrie are resolved against the
 * scheme/host/port of the page they were found on.
 *
 * @author xuruiqi
 * @param
 *      array $arrInput : run-time settings
 *          string 'page_class_name' : name of the Page class to use; must be
 *                                     a subclass of Phpfetcher_Page_Abstract.
 *                                     Falls back to self::DEFAULT_PAGE_CLASS
 *                                     when empty.
 *          [array 'page_conf']      : optional argument passed to the Page's
 *                                     setConf(); a 'url' entry is removed first
 * @return
 *      obj $this
 * @desc
 */
public function &run($arrInput = array())
{
    if (empty($this->_arrFetchJobs)) {
        Phpfetcher_Log::warning("No fetch jobs.");
        return $this;
    }

    // Build the Page object.
    $objPage = NULL;
    $strPageClassName = self::DEFAULT_PAGE_CLASS;
    if (!empty($arrInput['page_class_name'])) {
        $strPageClassName = strval($arrInput['page_class_name']);
    }
    try {
        if (!class_exists($strPageClassName, TRUE)) {
            throw new Exception("[{$strPageClassName}] class not exists!");
        }

        $objPage = new $strPageClassName();
        if (!$objPage instanceof Phpfetcher_Page_Abstract) {
            throw new Exception("[{$strPageClassName}] is not an instance of " . self::ABSTRACT_PAGE_CLASS);
        }
    } catch (Exception $e) {
        Phpfetcher_Log::fatal($e->getMessage());
        return $this;
    }

    // Initialise the Page object.
    $arrPageConf = empty($arrInput['page_conf']) ? array() : $arrInput['page_conf'];
    $objPage->init();
    if (!empty($arrPageConf)) {
        // The URL is set per page by the crawl loop, never from the config.
        if (isset($arrPageConf['url'])) {
            unset($arrPageConf['url']);
        }
        $objPage->setConf($arrPageConf);
    }

    // Walk the job list.
    foreach ($this->_arrFetchJobs as $job_name => $job_rules) {
        if (!$this->_isJobValid($job_rules)) {
            Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]");
            continue;
        }

        $intDepth   = 0;
        $intPageNum = 0;

        // Two queues: one is drained (current level) while the other is
        // filled (next level); $arrIndice says which is which.
        $arrIndice = array(0, 1);
        $arrJobs   = array(
            0 => array($job_rules['start_page']),
            1 => array(),
        );

        // Crawl until the queue is empty or a limit (-1 = unlimited) is hit.
        while (!empty($arrJobs[$arrIndice[0]])
            && ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth'])
            && ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {

            $intDepth += 1;
            $intPopIndex  = $arrIndice[0];
            $intPushIndex = $arrIndice[1];
            $arrJobs[$intPushIndex] = array();

            foreach ($arrJobs[$intPopIndex] as $url) {
                if (!($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {
                    break;
                }

                $objPage->setUrl($url);
                $objPage->read();

                // Collect all hyperlinks on the page.
                $arrLinks = $objPage->getHyperLinks();

                // Split the current URL into its components so that
                // site-relative links such as "/entry" can be completed.
                $strCurUrl = $objPage->getUrl();
                $arrUrlComponents = parse_url($strCurUrl);

                // parse_url() may return false or omit scheme/host; in that
                // case relative links on this page cannot be resolved.
                $blnCanResolve = is_array($arrUrlComponents)
                    && isset($arrUrlComponents['scheme'], $arrUrlComponents['host']);
                $strUrlPrefix = '';
                if ($blnCanResolve) {
                    $strUrlPrefix = $arrUrlComponents['scheme'] . "://" . $arrUrlComponents['host'];
                    if (isset($arrUrlComponents['port'])
                        && strlen((string)$arrUrlComponents['port']) != 0) {
                        $strUrlPrefix .= ":{$arrUrlComponents['port']}";
                    }
                }
                $blnWarned = false;

                // Match the hyperlinks against the job's rules.
                foreach ($job_rules['link_rules'] as $link_rule) {
                    foreach ($arrLinks as $link) {
                        if (preg_match($link_rule, $link) !== 1 || $this->getHash($link)) {
                            continue;
                        }

                        // Work out the actual URL to fetch.
                        $real_link = $link;

                        // Only the first 6 bytes are inspected for the scheme
                        // separator, so the whole link is never scanned;
                        // substr() keeps this safe for links shorter than 6 bytes.
                        $colon_pos = strpos((string)substr($link, 0, 6), ':');

                        if ($colon_pos === false
                            || !$this->_objSchemeTrie->has(substr($link, 0, $colon_pos))) {
                            if (!$blnCanResolve) {
                                if (!$blnWarned) {
                                    Phpfetcher_Log::warning(
                                        "Cannot resolve relative links, no scheme/host in [{$strCurUrl}]"
                                    );
                                    $blnWarned = true;
                                }
                                continue;
                            }

                            if (strncmp($link, '//', 2) === 0) {
                                // Protocol-relative link: it carries its own
                                // host and inherits only the current scheme.
                                $real_link = $arrUrlComponents['scheme'] . ':' . $link;
                            } else {
                                // Turn the site-relative address into a full URL.
                                $real_link = $strUrlPrefix
                                    . (strncmp($link, '/', 1) === 0 ? $link : "/{$link}");
                            }
                        }

                        $this->setHash($link, true);

                        // The resolved URL may already have been queued via a
                        // different spelling of the same link.
                        if ($real_link !== $link && $this->getHash($real_link)) {
                            continue;
                        }

                        $this->setHash($real_link, true);
                        $arrJobs[$intPushIndex][] = $real_link;
                    }
                }

                // handlePage() is implemented by the user.
                $objPage->setExtraInfo(array('job_name' => $job_name));
                $this->handlePage($objPage);
                $intPageNum += 1;
            }

            // Append any URLs collected in $this->_arrAdditionalUrls to the
            // next level, then clear the list.
            if (!empty($this->_arrAdditionalUrls)) {
                $arrJobs[$intPushIndex] = array_merge(
                    $arrJobs[$intPushIndex],
                    $this->_arrAdditionalUrls
                );
                $this->_arrAdditionalUrls = array();
            }

            // The filled queue becomes the one to drain on the next pass.
            self::_swap($arrIndice[0], $arrIndice[1]);
        }
    }

    return $this;
}
/**
 * Executes all registered fetch jobs, crawling each one level by level.
 *
 * Every page is read, its hyperlinks are matched against the job's
 * 'link_rules', unseen matches are queued for the next level, and the page
 * is then passed to handlePage().
 *
 * @author xuruiqi
 * @param
 *      array $arrInput : run-time settings
 *          string 'page_class_name' : Page class to instantiate; must be a
 *                                     subclass of Phpfetcher_Page_Abstract
 *          [array 'page_conf']      : optional argument for the Page's
 *                                     setConf() call
 * @return
 *      obj $this
 * @desc
 */
public function &run($arrInput = array())
{
    // Without jobs there is nothing to do.
    if (empty($this->_arrFetchJobs)) {
        Phpfetcher_Log::warning("No fetch jobs.");
        return $this;
    }

    // Resolve and construct the Page object.
    $strClass = empty($arrInput['page_class_name'])
        ? self::DEFAULT_PAGE_CLASS
        : strval($arrInput['page_class_name']);

    $objFetchPage = NULL;
    try {
        if (!class_exists($strClass, TRUE)) {
            throw new Exception("[{$strClass}] class not exists!");
        }

        $objFetchPage = new $strClass();

        $blnIsPage = $objFetchPage instanceof Phpfetcher_Page_Abstract;
        if (!$blnIsPage) {
            throw new Exception("[{$strClass}] is not an instance of " . self::ABSTRACT_PAGE_CLASS);
        }
    } catch (Exception $e) {
        Phpfetcher_Log::fatal($e->getMessage());
        return $this;
    }

    // Initialise the Page object; a 'url' entry in the config is dropped.
    $arrConf = array();
    if (!empty($arrInput['page_conf'])) {
        $arrConf = $arrInput['page_conf'];
    }
    $objFetchPage->init();
    if (!empty($arrConf)) {
        if (isset($arrConf['url'])) {
            unset($arrConf['url']);
        }
        $objFetchPage->setConf($arrConf);
    }

    // Process the jobs one by one.
    foreach ($this->_arrFetchJobs as $strJobName => $arrRules) {
        if (!$this->_isJobValid($arrRules)) {
            Phpfetcher_Log::warning("Job rules invalid [" . serialize($arrRules) . "]");
            continue;
        }

        // Limits of this job; -1 means "no limit".
        $mixMaxDepth = $arrRules['max_depth'];
        $mixMaxPages = $arrRules['max_pages'];

        $intLevel   = 0;
        $intFetched = 0;

        // $arrOrder[0] names the queue being drained, $arrOrder[1] the one
        // being filled for the next level.
        $arrOrder  = array(0, 1);
        $arrQueues = array(
            0 => array($arrRules['start_page']),
            1 => array(),
        );

        // Crawl loop.
        while (!empty($arrQueues[$arrOrder[0]])) {
            if ($mixMaxDepth !== -1 && !($intLevel < $mixMaxDepth)) {
                break;
            }
            if ($mixMaxPages !== -1 && !($intFetched < $mixMaxPages)) {
                break;
            }

            ++$intLevel;
            $intFrom = $arrOrder[0];
            $intTo   = $arrOrder[1];
            $arrQueues[$intTo] = array();

            foreach ($arrQueues[$intFrom] as $strPageUrl) {
                // Stop as soon as the page budget is used up.
                if ($mixMaxPages !== -1 && !($intFetched < $mixMaxPages)) {
                    break;
                }

                $objFetchPage->setUrl($strPageUrl);
                $objFetchPage->read();

                // Collect all hyperlinks on the page.
                $arrHrefs = $objFetchPage->getHyperLinks();

                // Queue every link that matches a rule and is not yet hashed.
                foreach ($arrRules['link_rules'] as $strPattern) {
                    foreach ($arrHrefs as $strHref) {
                        if (preg_match($strPattern, $strHref) !== 1) {
                            continue;
                        }
                        if ($this->getHash($strHref)) {
                            continue;
                        }
                        $this->setHash($strHref, true);
                        $arrQueues[$intTo][] = $strHref;
                    }
                }

                // handlePage() is implemented by the user.
                $objFetchPage->setExtraInfo(array('job_name' => $strJobName));
                $this->handlePage($objFetchPage);
                ++$intFetched;
            }

            // Exchange the roles of the two queues for the next level.
            self::_swap($arrOrder[0], $arrOrder[1]);
        }
    }

    return $this;
}