/**
 * Collect content for every configured entry URL.
 *
 * For each URL in $this->urls the entry page is fetched via getResult().
 * When the result contains the main-content selector, the remaining pages
 * of the article are fetched with a separate Finder and their main content
 * is gathered page by page. String replacements are then applied to every
 * selected value and every page, <img> tags in the main content are
 * rewritten, and the joined content is stored in the result under
 * $this->contentSelectorIDAlias.
 *
 * Events dispatched:
 *  - 'collect_paged_main_content_success' ($url, $content, $pictures) once per content page
 *  - 'collect_content_success' ($url, $result, $pictures) once per successfully fetched URL
 *  - 'collect_content_fail' ($url) when getResult() returns false
 *
 * After all URLs are processed, $this->pictureMaker->download() is called
 * when $this->downloadPicture is truthy.
 *
 * @return void
 */
public function getContents()
{
    /* Check the initialisation state */
    $this->checkIsInited();

    /* Walk through all entry links */
    foreach ($this->urls as $url) {
        $this->collectorParser->simplifyUrl($url);

        /* Container for the main content of each page */
        $paged_main_content = [];

        /*
         * Pictures found in the main content, handed to the listeners.
         * Reset for every URL so that an entry without main content reports
         * an empty list instead of an undefined variable or the pictures of
         * the previously processed URL.
         */
        $content_pictures = [];

        /* Fetch the entry page */
        $result = $this->getResult($url);
        if ($result === false) {
            $this->dispatch('collect_content_fail', $url);
            continue;
        }

        /* Only follow pagination when main content was actually selected */
        if (isset($result[self::MAIN_CONTENT_SELECTOR_ID])) {
            /* Keep the content of the first page */
            $paged_main_content[] = $result[self::MAIN_CONTENT_SELECTOR_ID];

            /*
             * Default to "no further pages": if getContentPages() throws and
             * the page mode is not PAGES_INLINE nothing else assigns this
             * variable, and count() below must still receive an array.
             */
            $page_urls = [];
            try {
                $page_urls = $this->collectorParser->getContentPages($this->getHtml(), $this->getHtmlDom());
            } catch (Exception $e) {
                if ($this->contentPageMode === self::PAGES_INLINE) {
                    $page_urls = $this->getContentInlinePages(
                        $this->getHtml(),
                        $this->getHtmlDom(),
                        $this->contentPagesSelector
                    );
                }
            }

            /* Separate finder instance used to fetch the remaining pages */
            $mainContentFinder = new Finder();
            $mainContentFinder->addSelector(self::MAIN_CONTENT_SELECTOR_ID, $this->contentSelector);

            while (count($page_urls) > 0) {
                $page_url = array_shift($page_urls);
                $page_result = $mainContentFinder->getResult($this->collectorParser->changeUrl($page_url));

                /*
                 * Append the content of THIS page (not the first page again).
                 * !empty() keeps the original truthiness test while tolerating
                 * a failed lookup or a missing key.
                 */
                if (!empty($page_result[self::MAIN_CONTENT_SELECTOR_ID])) {
                    $paged_main_content[] = $page_result[self::MAIN_CONTENT_SELECTOR_ID];
                }

                /* In context mode with the stock parser, queue the "next page" link */
                if (
                    get_class($this->collectorParser) === __NAMESPACE__ . '\\CollectorParser'
                    && $this->contentPageMode === self::PAGES_CONTEXT
                ) {
                    /*
                     * NOTE(review): the lookup reads $this->getHtml()/getHtmlDom(),
                     * not anything from $mainContentFinder. If those still hold the
                     * entry page, the same link would be queued on every pass -
                     * confirm this is intended.
                     */
                    $next_url = $this->collectorParser->getContentContextPage(
                        $this->getHtml(),
                        $this->getHtmlDom(),
                        $this->contentPagesSelector
                    );
                    if ($next_url) {
                        $page_urls[] = $next_url;
                    }
                }
            }

            /* The raw main content now lives in $paged_main_content only */
            unset($page_url, $next_url, $page_result, $result[self::MAIN_CONTENT_SELECTOR_ID]);
        }

        /* Apply the replacement rules of each selector to its value */
        foreach ($result as $key => &$item) {
            if ($selector = $this->getSelector($key)) {
                $item = $this->replaceString($item, $selector);
            }
        }
        unset($key, $item); /* drop the dangling reference */

        /* Apply the content selector's replacement rules to every page */
        foreach ($paged_main_content as &$content) {
            if ($this->contentSelector) {
                $content = $this->replaceString($content, $this->contentSelector);
            }
        }
        unset($content); /* drop the dangling reference */

        if ($paged_main_content) {
            /* Local copies of the collaborators captured by the closure below */
            $collectorParser = $this->collectorParser;
            $pictureMaker = $this->pictureMaker;
            $downloadPicture = $this->downloadPicture;

            /* Rewrite the picture addresses in the main content */
            foreach ($paged_main_content as &$content) {
                /* Pictures of the current page only, handed to the listeners */
                $paged_content_pictures = [];

                /*
                 * The pattern needs two whitespace characters or another
                 * attribute between "<img" and "src", plus whitespace after the
                 * closing quote; other <img> tags are left untouched.
                 * NOTE(review): confirm this restriction is intended.
                 */
                $content = preg_replace_callback(
                    '/<img\\s[^>]*\\ssrc="([^>]+?)"\\s[^>]*\\/?>/i',
                    function ($match) use (
                        $collectorParser,
                        $pictureMaker,
                        &$content_pictures,
                        &$paged_content_pictures,
                        $downloadPicture
                    ) {
                        /* Complete the picture link */
                        $pic_url = $collectorParser->changeUrl($match[1]);

                        /* When pictures are downloaded, point at the target address */
                        if ($downloadPicture) {
                            $pic_url = $pictureMaker->getUrl($pic_url);
                        }

                        $content_pictures[] = $pic_url;
                        $paged_content_pictures[] = $pic_url;

                        return '<img src="' . $pic_url . '" />';
                    },
                    $content
                );

                $this->dispatch('collect_paged_main_content_success', $url, $content, $paged_content_pictures);
            }
            unset($collectorParser, $pictureMaker, $downloadPicture, $content, $paged_content_pictures);

            /* Store the joined pages under the alias key */
            $result[$this->contentSelectorIDAlias] = Helper::formatContent(implode('', $paged_main_content));
        }

        $this->dispatch('collect_content_success', $url, $result, $content_pictures);
    }
    unset($url);

    if ($this->downloadPicture) {
        /* Start downloading the pictures */
        $this->pictureMaker->download();
    }
}