Пример #1
0
 /**
  * Automatically crawls URLs taken from the URL records stored in MySQL.
  *
  * Each iteration asks getNotCollectCrawlerUrl() for the next record,
  * downloads its page, converts it to UTF-8, queues the links found by
  * findA() through addCrawlerUrls() and stores the result of findContent()
  * through addCrawlerUrlContent(). The record is then saved with
  * request_status and analyze_status set to 2. If an Exception is thrown
  * while a record is being processed, the record is saved with
  * request_status 3 and the exception message in desc, and the loop goes on
  * with the next record.
  *
  * The loop ends early as soon as no record with a non-empty url is returned.
  *
  * @param int $countNum Maximum number of URL records to process in one call.
  * @return bool Always true.
  */
 public function autoCollect($countNum = 1000)
 {
     // A batch can take far longer than the default limit; 0 disables the time limit.
     set_time_limit(0);
     $commonCurl = new CommonCurl();
     for ($i = 0; $i < $countNum; $i++) {
         $crawlerUrl = $this->getNotCollectCrawlerUrl();
         if (empty($crawlerUrl['url'])) {
             // Nothing left to crawl.
             break;
         }
         try {
             $htmlString = $commonCurl->baseCurl($crawlerUrl['url']);
             $htmlString = $commonCurl->formatEncodingUtf8($htmlString);
             $urls = $this->findA($htmlString);
             if (!empty($urls)) {
                 $this->addCrawlerUrls($crawlerUrl['url'], $urls);
             }
             $content = $this->findContent($htmlString);
             // Store the content BEFORE flagging the record as done. Saving the
             // status first would leave the record marked as analyzed
             // (analyze_status 2) even when storing the content throws.
             $this->addCrawlerUrlContent($crawlerUrl['id'], $content);
             // NOTE(review): 2 presumably means "done" for both statuses - confirm
             // against the status definitions of the crawler url model.
             $crawlerUrl->request_status = 2;
             $crawlerUrl->analyze_status = 2;
             $crawlerUrl->save();
         } catch (Exception $exp) {
             // Request failed: any exception raised above is recorded on the
             // record (status 3 plus the message) so the batch can continue.
             $crawlerUrl->request_status = 3;
             $crawlerUrl->desc = $exp->getMessage();
             $crawlerUrl->save();
         }
     }
     return true;
 }
Пример #2
0
 /**
  * Gets the URL links contained in the page behind a URL.
  *
  * The page at $request_url is downloaded into a scratch file below the
  * application's runtime path, read back, converted to UTF-8 and passed to
  * BudejieCrawler::findA(). The result is handed to
  * BudejieCrawler::addCrawlerUrls() together with $request_url and returned.
  *
  * @param string $request_url URL of the page to download.
  * @return array The value returned by findA() for the downloaded page.
  * @throws Exception
  * @throws \Exception If the downloaded file cannot be read back.
  */
 public function urlList($request_url)
 {
     $commonCurl = new CommonCurl();
     $commonCrawler = new BudejieCrawler();
     // NOTE(review): every call shares this one scratch file, so concurrent
     // calls can overwrite each other, and a download that fails without
     // throwing may leave a previous page's content here - confirm how
     // baseCurlFile() reports failures.
     $file = Yii::$app->runtimePath . '/curl/1.txt';
     $commonCurl->baseCurlFile($request_url, $file);
     $htmlString = file_get_contents($file);
     if ($htmlString === false) {
         // file_get_contents() returns false on failure; stop here instead of
         // feeding a boolean into the encoding and link-extraction helpers.
         throw new \Exception('Unable to read downloaded page from ' . $file);
     }
     $htmlString = $commonCurl->formatEncodingUtf8($htmlString);
     $res = $commonCrawler->findA($htmlString);
     $commonCrawler->addCrawlerUrls($request_url, $res);
     return $res;
 }