示例#1
0
 /**
  * Collect the content of every configured entry URL.
  *
  * For each URL in $this->urls the entry page is fetched through getResult().
  * When the result contains the main-content selector, the remaining pages of
  * the article are fetched as well and their main content is gathered, string
  * replacements are applied, <img> tags in the main content are rewritten, and
  * the merged content is stored in the result under $this->contentSelectorIDAlias.
  *
  * Events dispatched:
  *  - 'collect_paged_main_content_success' ($url, $content, $pictures) once per page;
  *  - 'collect_content_success' ($url, $result, $pictures) once per collected URL;
  *  - 'collect_content_fail' ($url) when getResult() returns false.
  *
  * After all URLs are processed, the picture maker's download() is invoked
  * if $this->downloadPicture is truthy.
  *
  * @return void
  */
 public function getContents()
 {
     /* Verify the initialisation state before doing any work */
     $this->checkIsInited();
     /* Walk over every entry URL */
     foreach ($this->urls as $url) {
         $this->collectorParser->simplifyUrl($url);
         /* Per-URL containers: main content of each page and every picture URL found.
            Both are reset here so nothing leaks over from the previous URL and the
            success event below always receives a defined array. */
         $paged_main_content = [];
         $content_pictures = [];
         /* Fetch the entry page; report the failure and move on if it cannot be read */
         if (($result = $this->getResult($url)) === false) {
             $this->dispatch('collect_content_fail', $url);
             continue;
         }
         /* Only follow pagination when the main content is being collected */
         if (isset($result[self::MAIN_CONTENT_SELECTOR_ID])) {
             /* Keep the content of the first page */
             $paged_main_content[] = $result[self::MAIN_CONTENT_SELECTOR_ID];
             /* Default to "no further pages" so the loop below is always safe */
             $page_urls = [];
             try {
                 $page_urls = $this->collectorParser->getContentPages($this->getHtml(), $this->getHtmlDom());
             } catch (Exception $e) {
                 /* The parser could not list the pages; fall back to the inline lookup when configured */
                 if ($this->contentPageMode === self::PAGES_INLINE) {
                     $page_urls = $this->getContentInlinePages($this->getHtml(), $this->getHtmlDom(), $this->contentPagesSelector);
                 }
             }
             if (!is_array($page_urls)) {
                 /* count() rejects non-countable values; treat them as "no further pages" */
                 $page_urls = [];
             }
             /* Dedicated finder used to extract the main content of the follow-up pages */
             $mainContentFinder = new Finder();
             $mainContentFinder->addSelector(self::MAIN_CONTENT_SELECTOR_ID, $this->contentSelector);
             while (count($page_urls) > 0) {
                 $page_url = array_shift($page_urls);
                 $page_result = $mainContentFinder->getResult($this->collectorParser->changeUrl($page_url));
                 /* Store the content of THIS page (not the first page again); skip failed or empty pages */
                 if (!empty($page_result[self::MAIN_CONTENT_SELECTOR_ID])) {
                     $paged_main_content[] = $page_result[self::MAIN_CONTENT_SELECTOR_ID];
                 }
                 if (get_class($this->collectorParser) === __NAMESPACE__ . '\\CollectorParser' && $this->contentPageMode === self::PAGES_CONTEXT) {
                     /* NOTE(review): the next-page lookup reads $this->getHtml()/getHtmlDom(), not the
                        finder's page; confirm these reflect the page that was just fetched. */
                     if ($next_url = $this->collectorParser->getContentContextPage($this->getHtml(), $this->getHtmlDom(), $this->contentPagesSelector)) {
                         $page_urls[] = $next_url;
                     }
                 }
             }
             /* The raw main content now lives in $paged_main_content; drop it from the result */
             unset($page_url, $next_url, $page_result, $result[self::MAIN_CONTENT_SELECTOR_ID]);
         }
         /* Apply the configured string replacements to every remaining field */
         foreach ($result as $key => &$item) {
             if ($selector = $this->getSelector($key)) {
                 $item = $this->replaceString($item, $selector);
             }
         }
         unset($key, $item);
         /* Apply the string replacements to the content of every page */
         if ($this->contentSelector) {
             foreach ($paged_main_content as &$content) {
                 $content = $this->replaceString($content, $this->contentSelector);
             }
             unset($content);
         }
         if ($paged_main_content) {
             /* Local copies of the collaborators used inside the closure */
             $collectorParser = $this->collectorParser;
             $pictureMaker = $this->pictureMaker;
             $downloadPicture = $this->downloadPicture;
             /* Rewrite the picture addresses in the main content */
             foreach ($paged_main_content as &$content) {
                 /* Pictures found on this page only; handed to the per-page listener */
                 $paged_content_pictures = [];
                 $rewritten = preg_replace_callback('/<img\\s[^>]*\\ssrc="([^>]+?)"\\s[^>]*\\/?>/i', function ($match) use($collectorParser, $pictureMaker, &$content_pictures, &$paged_content_pictures, $downloadPicture) {
                     /* Complete the picture link */
                     $pic_url = $collectorParser->changeUrl($match[1]);
                     /* When pictures are downloaded, point at the target address instead */
                     if ($downloadPicture) {
                         $pic_url = $pictureMaker->getUrl($pic_url);
                     }
                     $content_pictures[] = $pic_url;
                     $paged_content_pictures[] = $pic_url;
                     return '<img src="' . $pic_url . '" />';
                 }, $content);
                 /* preg_replace_callback() yields null on a PCRE error; keep the page untouched then */
                 if ($rewritten !== null) {
                     $content = $rewritten;
                 }
                 $this->dispatch('collect_paged_main_content_success', $url, $content, $paged_content_pictures);
             }
             unset($collectorParser, $pictureMaker, $downloadPicture, $content, $paged_content_pictures, $rewritten);
             /* Store the merged content under the configured alias */
             $result[$this->contentSelectorIDAlias] = Helper::formatContent(implode('', $paged_main_content));
         }
         $this->dispatch('collect_content_success', $url, $result, $content_pictures);
     }
     unset($url);
     if ($this->downloadPicture) {
         /* Start downloading the collected pictures */
         $this->pictureMaker->download();
     }
 }