Пример #1
0
 /**
  * Scrape the user name for every WeiBo record whose status is 0.
  *
  * For each record the page at $weibo->url is fetched, the raw response
  * body is dumped to ./test.html (overwritten on every iteration) and the
  * text of the `h1.username` node is stored as the record's name.
  *
  * @return void
  */
 public function grab()
 {
     $weibos = WeiBo::whereStatus(0)->get();
     $count = count($weibos);
     foreach ($weibos as $key => $weibo) {
         // Progress line: the URL followed by "<key>/<total>".
         $this->comment($weibo->url . "      {$key}/{$count}");
         $crawler = new Crawler();
         $crawler->get($weibo->url)->startfilter();
         // file_put_contents() opens, truncates, writes and closes in one
         // call, so no file handle is left open per processed record.
         file_put_contents('./test.html', $crawler->getBody());
         // NOTE(review): the node is not checked for existence before text()
         // is called - confirm how Crawler behaves when nothing matches.
         $weibo->name = $crawler->filter('h1.username')->text();
         $weibo->save();
     }
 }
Пример #2
0
 /**
  * Crawl every ZhiHu question record whose status is 0.
  *
  * Records are loaded in passes, optionally windowed by the `offset` and
  * `limit` command options. For each record the page is fetched and the
  * title, content, answer count and follower ("concerned") count are
  * saved, then the record is marked with status 1. Question links found on
  * the page are stored as new ZhiHu records and author links as ZhiHuUser
  * records. The method returns once a pass finds no pending record.
  *
  * @return void
  */
 public function grap()
 {
     $offset = $this->option('offset');
     $limit = $this->option('limit');
     while (true) {
         $query = ZhiHu::where('status', 0);
         $offset and $query = $query->skip($offset);
         $limit and $query = $query->take($limit);
         $zhihus = $query->get();
         // Nothing left to crawl: stop instead of re-running the same
         // empty query in a tight loop forever.
         if (count($zhihus) === 0) {
             break;
         }
         foreach ($zhihus as $zhihu) {
             $craw = new Crawler();
             $url = $zhihu->url;
             $this->info($url);
             $craw->get($url)->startFilter();
             $titleNode = $craw->filter('h2.zm-item-title');
             // A page without a title node is skipped.
             // NOTE(review): its status stays 0, so it is fetched again on
             // the next pass - confirm whether it should be marked instead.
             if (!count($titleNode)) {
                 continue;
             }
             $zhihu->title = trim($titleNode->text());
             $zhihu->content = $craw->filter('div.zm-editable-content')->text();
             $answerNode = $craw->filter('h3#zh-question-answer-num');
             $zhihu->answer_num = count($answerNode) ? $answerNode->attr('data-num') : 0;
             // The follower count is the first run of digits in the side header text.
             $concerned_num = $craw->filter('div#zh-question-side-header-wrap')->text();
             preg_match('/\\d+/', $concerned_num, $matchs);
             $zhihu->concerned_num = isset($matchs[0]) ? $matchs[0] : 0;
             $zhihu->status = 1;
             $zhihu->save();
             $this->comment('answer----conserned:   ' . $zhihu->answer_num . '---' . $zhihu->concerned_num);
             // Queue every linked question that is not stored yet.
             $links = $craw->filter('a.question_link');
             if (count($links)) {
                 $links->each(function ($node) {
                     $link = 'http://www.zhihu.com' . $node->attr('href');
                     if (!ZhiHu::where('url', $link)->first()) {
                         $this->question($link);
                         ZhiHu::saveData(['url' => $link]);
                     }
                 });
             }
             // Register every author profile linked from the page.
             $userLinks = $craw->filter('a.author-link');
             if (count($userLinks)) {
                 $userLinks->each(function ($node) {
                     $link = 'http://www.zhihu.com' . $node->attr('href');
                     ZhiHuUser::firstOrCreate(['url' => $link], ['url' => $link]);
                 });
             }
         }
     }
 }
Пример #3
0
 /**
  * Crawl every ZhiHuUser profile whose status is 0, newest first.
  *
  * Records are loaded in passes, optionally windowed by the `offset` and
  * `limit` command options. For each profile the name, gender, city, job,
  * description and the activity counters are saved and the record is
  * marked with status 1; a profile page without a `span.name` node is
  * marked with status -2 instead. After every 300 seconds of crawling the
  * command pauses for 30 seconds. The method returns once a pass finds no
  * pending record.
  *
  * @return void
  */
 public function grap()
 {
     $offset = $this->option('offset');
     $limit = $this->option('limit');
     // Timestamp of the last throttle pause (or of the start of the run).
     $last_pause = time();
     while (true) {
         $query = ZhiHuUser::whereStatus(0)->orderBy('created_at', 'desc');
         $offset and $query = $query->skip($offset);
         $limit and $query = $query->take($limit);
         $zhihus = $query->get();
         $count = count($zhihus);
         // Nothing left to crawl: stop instead of re-running the same
         // empty query in a tight loop forever.
         if ($count === 0) {
             break;
         }
         foreach ($zhihus as $key => $user) {
             $craw = new Crawler();
             $url = $user->url;
             // Progress line: the URL followed by "<key>/<total>".
             $this->info($url . "        {$key}/{$count}");
             $craw->get($url)->startFilter();
             $nameNode = $craw->filter('span.name');
             if (!count($nameNode)) {
                 // No name node on the page: flag the record and move on.
                 $user->status = -2;
                 $user->save();
                 $this->error('this user is die');
                 continue;
             }
             $user->name = $nameNode->text();
             $cityNode = $craw->filter('span.location');
             $genderNode = $craw->filter('span.gender i');
             if (count($genderNode)) {
                 // Gender is derived from the icon's class: 2 when it
                 // contains "female", 1 otherwise.
                 if (strstr($genderNode->attr('class'), 'female')) {
                     $user->gender = 2;
                 } else {
                     $user->gender = 1;
                 }
             }
             $user->city = count($cityNode) ? $cityNode->attr('title') : '';
             $jobNode = $craw->filter('span.business');
             $user->job = count($jobNode) ? $jobNode->attr('title') : '';
             $descNode = $craw->filter('span.description');
             $user->desc = count($descNode) ? trim($descNode->text()) : '';
             $user->be_favor = $craw->filter('span.zm-profile-header-user-agree strong')->text();
             $user->be_thank = $craw->filter('span.zm-profile-header-user-thanks strong')->text();
             $user->asks = $craw->filter('div.profile-navbar a')->eq(1)->filter('span.num')->text();
             $user->answers = $craw->filter('div.profile-navbar a')->eq(2)->filter('span.num')->text();
             $user->concerned = $craw->filter('div.zm-profile-side-following a')->eq(0)->filter('strong')->text();
             $user->be_concerned = $craw->filter('div.zm-profile-side-following a')->eq(1)->filter('strong')->text();
             $user->status = 1;
             $user->save();
             // Throttle: pause for 30 seconds once at least 300 seconds have
             // passed since the previous pause. Comparing with >= keeps the
             // pause from depending on hitting an exact second, and from
             // firing right after the very first request.
             if (time() - $last_pause >= 300) {
                 sleep(30);
                 $last_pause = time();
             }
         }
     }
 }