/**
 * Automatically crawl pending URLs, driven by the URL records stored in MySQL.
 *
 * Handles at most 1000 records per call. For every record returned by
 * getNotCollectCrawlerUrl() the page is downloaded, converted to UTF-8,
 * scanned for links (queued through addCrawlerUrls()) and its content is
 * extracted and stored through addCrawlerUrlContent(). The loop ends early as
 * soon as no record with a non-empty url is returned.
 *
 * Failures while processing a single record are written back to that record
 * (request_status = 3 plus the exception message in desc) and the loop moves
 * on to the next record.
 *
 * @return bool always true
 */
public function autoCollect()
{
    // Lift the script execution time limit; a batch may run for a long time.
    set_time_limit(0);

    $countNum = 1000;
    $commonCurl = new CommonCurl();

    for ($i = 0; $i < $countNum; $i++) {
        $crawlerUrl = $this->getNotCollectCrawlerUrl();
        if (empty($crawlerUrl['url'])) {
            // Nothing left to collect.
            break;
        }

        try {
            $htmlString = $commonCurl->baseCurl($crawlerUrl['url']);
            $htmlString = $commonCurl->formatEncodingUtf8($htmlString);

            // Queue every link discovered on this page, with this page as the source.
            $urls = $this->findA($htmlString);
            if (!empty($urls)) {
                $this->addCrawlerUrls($crawlerUrl['url'], $urls);
            }

            $content = $this->findContent($htmlString);

            // Status 2 presumably means "done" for both stages -- TODO confirm against the model.
            $crawlerUrl->request_status = 2;
            $crawlerUrl->analyze_status = 2;
            $crawlerUrl->save();

            $this->addCrawlerUrlContent($crawlerUrl['id'], $content);
        } catch (\Exception $exp) {
            // Catch the global \Exception on purpose: an unqualified "Exception"
            // may resolve to an imported/namespaced class and would then miss
            // failures thrown as \Exception, aborting the whole batch and leaving
            // this record pending.
            // Request failed: record the reason on the row.
            $crawlerUrl->request_status = 3;
            $crawlerUrl->desc = $exp->getMessage();
            $crawlerUrl->save();
        }
    }

    return true;
}
/**
 * Collect the links contained in the page at the given URL.
 *
 * The page is downloaded into a file under the application runtime path,
 * read back, converted to UTF-8 and scanned for links with
 * BudejieCrawler::findA(). Any links found are also queued through
 * addCrawlerUrls() with $request_url as their source.
 *
 * NOTE(review): the download target is a fixed file name, so concurrent calls
 * would overwrite each other's download -- confirm this is only run serially.
 *
 * @param string $request_url URL of the page to fetch
 * @return array links found by findA()
 * @throws Exception
 * @throws \Exception when the downloaded file cannot be read
 */
public function urlList($request_url)
{
    $commonCurl = new CommonCurl();
    $commonCrawler = new BudejieCrawler();

    $file = Yii::$app->runtimePath . '/curl/1.txt';
    $commonCurl->baseCurlFile($request_url, $file);

    $htmlString = file_get_contents($file);
    if ($htmlString === false) {
        // Do not feed `false` into the encoding/parsing helpers; fail loudly instead.
        throw new \Exception('Unable to read downloaded page from ' . $file);
    }
    $htmlString = $commonCurl->formatEncodingUtf8($htmlString);

    $res = $commonCrawler->findA($htmlString);
    // Same guard as autoCollect(): only queue links when some were found.
    if (!empty($res)) {
        $commonCrawler->addCrawlerUrls($request_url, $res);
    }

    return $res;
}