/**
 * Crawl every WeiBo record whose status is 0 and store the user name found on its page.
 *
 * For each record the page at $weibo->url is fetched, its raw body is written to
 * ./test.html (overwritten on every iteration, so only the last page survives),
 * and the text of the `h1.username` node is saved into the record's `name`.
 * This method does not assign the record's status itself.
 */
public function grab()
{
    $weibos = WeiBo::whereStatus(0)->get();
    $count = count($weibos);

    foreach ($weibos as $key => $weibo) {
        // Progress line: "<url> <index>/<total>".
        $this->comment($weibo->url . " {$key}/{$count}");

        $crawler = new Crawler();
        $crawler->get($weibo->url)->startfilter();

        // Dump of the fetched page. file_put_contents() opens, writes and closes
        // in one call, so no file handle is left open per iteration.
        file_put_contents('./test.html', $crawler->getBody());

        $weibo->name = $crawler->filter('h1.username')->text();
        $weibo->save();
    }
}
/**
 * Crawl ZhiHu question rows whose status is 0 and fill in their details.
 *
 * Rows are fetched in passes, honouring the `offset` and `limit` command options
 * when they are set. For every row the question page is fetched and the title,
 * content, answer count and follower ("concerned") count are stored; the row is
 * then saved with status 1. Question links found on the page are stored as new
 * ZhiHu rows and author links as ZhiHuUser rows.
 *
 * The method returns once a pass saves no row at all (nothing pending, or only
 * pages without a title remain), instead of re-querying and re-crawling forever.
 */
public function grap()
{
    $offset = $this->option('offset');
    $limit = $this->option('limit');

    while (true) {
        $query = ZhiHu::where('status', 0);
        $offset and $query = $query->skip($offset);
        $limit and $query = $query->take($limit);
        $zhihus = $query->get();

        // Rows marked as done during this pass; zero means no progress was made.
        $saved = 0;

        foreach ($zhihus as $zhihu) {
            $craw = new Crawler();
            $url = $zhihu->url;
            $this->info($url);
            $craw->get($url)->startFilter();

            $titleNode = $craw->filter('h2.zm-item-title');
            if (!count($titleNode)) {
                // No title on the page: skip it. The row keeps status 0.
                continue;
            }
            $zhihu->title = trim($titleNode->text());

            // Guarded with count() like the answer node below, so a page lacking
            // this element yields an empty string instead of calling text() on an
            // empty selection.
            $contentNode = $craw->filter('div.zm-editable-content');
            $zhihu->content = count($contentNode) ? $contentNode->text() : '';

            $answerNode = $craw->filter('h3#zh-question-answer-num');
            $zhihu->answer_num = count($answerNode) ? $answerNode->attr('data-num') : 0;

            // The follower count is the first run of digits in the side header text.
            $sideNode = $craw->filter('div#zh-question-side-header-wrap');
            $concerned_num = count($sideNode) ? $sideNode->text() : '';
            preg_match('/\\d+/', $concerned_num, $matchs);
            $zhihu->concerned_num = isset($matchs[0]) ? $matchs[0] : 0;

            // NOTE: the view count is not collected; that extraction was disabled.

            $zhihu->status = 1;
            $zhihu->save();
            $saved++;
            $this->comment('answer----conserned: ' . $zhihu->answer_num . '---' . $zhihu->concerned_num);

            // Queue every linked question that is not stored yet.
            $links = $craw->filter('a.question_link');
            if (count($links)) {
                $links->each(function ($node) {
                    $link = 'http://www.zhihu.com' . $node->attr('href');
                    if (!ZhiHu::where('url', $link)->first()) {
                        $this->question($link);
                        ZhiHu::saveData(['url' => $link]);
                    }
                });
            }

            // Record every author linked from the page.
            $userLinks = $craw->filter('a.author-link');
            if (count($userLinks)) {
                $userLinks->each(function ($node) {
                    $link = 'http://www.zhihu.com' . $node->attr('href');
                    ZhiHuUser::firstOrCreate(['url' => $link], ['url' => $link]);
                });
            }
        }

        // Stop when the pass saved nothing; otherwise the same query would be
        // repeated (and the same pages re-fetched) endlessly with no delay.
        if ($saved === 0) {
            break;
        }
    }
}
/**
 * Crawl ZhiHuUser rows whose status is 0 (newest first) and fill in their profile.
 *
 * Rows are fetched in passes, honouring the `offset` and `limit` command options
 * when they are set. A page without a `span.name` node marks the user with
 * status -2; otherwise name, gender, city, job, description and the profile
 * counters are stored and the row is saved with status 1.
 *
 * The method returns when a pass finds no pending rows. Roughly every 300 seconds
 * of crawling it sleeps for 30 seconds.
 */
public function grap()
{
    $offset = $this->option('offset');
    $limit = $this->option('limit');

    // Time of the last pause. Kept across passes so that small batches do not
    // keep resetting the timer.
    $last_pause = time();

    while (true) {
        $query = ZhiHuUser::whereStatus(0)->orderBy('created_at', 'desc');
        $offset and $query = $query->skip($offset);
        $limit and $query = $query->take($limit);
        $zhihus = $query->get();
        $count = count($zhihus);

        // Nothing pending: stop instead of re-running the query in a tight loop.
        if ($count === 0) {
            break;
        }

        foreach ($zhihus as $key => $user) {
            // Pause once at least 300 seconds have elapsed since the last pause.
            // (Testing "elapsed % 300 == 0" only fires on an exact second and
            // also fires at elapsed 0.)
            if ((time() - $last_pause) >= 300) {
                sleep(30);
                $last_pause = time();
            }

            $craw = new Crawler();
            $url = $user->url;
            // Progress line: "<url> <index>/<total>".
            $this->info($url . " {$key}/{$count}");
            $craw->get($url)->startFilter();

            $nameNode = $craw->filter('span.name');
            if (!count($nameNode)) {
                // No name on the page: flag the row with status -2 and move on.
                $user->status = -2;
                $user->save();
                $this->error('this user is die');
                continue;
            }
            $user->name = $nameNode->text();

            // Gender is only assigned when the icon exists: 2 when its class
            // contains "female", 1 otherwise.
            $genderNode = $craw->filter('span.gender i');
            if (count($genderNode)) {
                if (strstr($genderNode->attr('class'), 'female')) {
                    $user->gender = 2;
                } else {
                    $user->gender = 1;
                }
            }

            // Optional profile fields fall back to an empty string.
            $cityNode = $craw->filter('span.location');
            $user->city = count($cityNode) ? $cityNode->attr('title') : '';

            $jobNode = $craw->filter('span.business');
            $user->job = count($jobNode) ? $jobNode->attr('title') : '';

            $descNode = $craw->filter('span.description');
            $user->desc = count($descNode) ? trim($descNode->text()) : '';

            // Profile counters, read without a presence check.
            $user->be_favor = $craw->filter('span.zm-profile-header-user-agree strong')->text();
            $user->be_thank = $craw->filter('span.zm-profile-header-user-thanks strong')->text();

            $navLinks = $craw->filter('div.profile-navbar a');
            $user->asks = $navLinks->eq(1)->filter('span.num')->text();
            $user->answers = $navLinks->eq(2)->filter('span.num')->text();

            $followLinks = $craw->filter('div.zm-profile-side-following a');
            $user->concerned = $followLinks->eq(0)->filter('strong')->text();
            $user->be_concerned = $followLinks->eq(1)->filter('strong')->text();

            $user->status = 1;
            $user->save();
        }
    }
}