/**
 * Crawler callback invoked once the given URL has been crawled.
 *
 * When a Webpage row matching the URL exists, its `crawlcount` is raised by
 * one and saved. Regardless of whether such a row exists, the URL and the
 * response are then passed on to SaveSnapshot::create().
 *
 * @param \SimZal\Crawler\Url $url
 * @param \Psr\Http\Message\ResponseInterface|null $response
 */
public function hasBeenCrawled(Url $url, $response)
{
    $address = (string) $url;
    $page = Webpage::where('url', $address)->first();

    if ($page !== null) {
        // NOTE(review): the counter is read, bumped in PHP and written back,
        // so two concurrent crawls of the same URL could lose an update.
        $page->crawlcount++;
        $page->save();
    }

    SaveSnapshot::create($url, $response);
}
/**
 * Pick the next page to hand to the crawler.
 *
 * The Webpage with the lowest `crawlcount` wins. When there is no Webpage
 * row at all, a randomly picked Site provides the URL instead.
 *
 * @return array Two entries: 'url' and 'siteId'.
 */
public function pageToCrawl()
{
    $leastCrawled = Webpage::orderBy('crawlcount', 'ASC')->limit(1)->first();

    if ($leastCrawled) {
        return ['url' => $leastCrawled->url, 'siteId' => $leastCrawled->site_id];
    }

    // NOTE(review): nothing here handles the case of there being no Site rows.
    $fallbackSite = Site::all()->random();

    return ['url' => $fallbackSite->url, 'siteId' => $fallbackSite->id];
}
/**
 * Store a Snapshot of a crawled URL for the Webpage that matches it.
 *
 * Nothing is stored when no Webpage row exists for the URL. The snapshot
 * records the response status code (or self::UNRESPONSIVE_HOST when there is
 * no response), the filtered HTML body when self::isHtml() accepts the
 * response (otherwise an empty string), and the md5 hash of that HTML.
 *
 * @param Url $url
 * @param \Psr\Http\Message\ResponseInterface|null $response Presumably a PSR-7
 *        response (getStatusCode() and getBody() are called on it); a falsy
 *        value is recorded as an unresponsive host.
 * @return void
 */
public static function create(Url $url, $response)
{
    // Lower-case column name: quoted identifiers are case-sensitive on some
    // databases, and every other lookup on this column uses 'url'.
    $webpage = Webpage::where('url', (string) $url)->first();
    if (!$webpage) {
        return;
    }

    $statusCode = $response ? $response->getStatusCode() : self::UNRESPONSIVE_HOST;

    $html = '';
    // Without a response there is no body to read, so skip the HTML check
    // entirely instead of passing null along.
    if ($response && self::isHtml($response)) {
        $html = FilterContent::make($response->getBody()->getContents());
    }

    Snapshot::create([
        'html' => $html,
        'status_code' => $statusCode,
        'webpage_id' => $webpage->id,
        'hash' => md5($html),
    ]);
}
/**
 * Build the list of webpages of one site, narrowed by time, type and search.
 *
 * Webpages are joined with their snapshots, so one webpage can match several
 * times; duplicates are removed from the resulting collection with unique().
 *
 * - $timeFilter, when truthy, is a number of days; rows are restricted to
 *   those newer than that many days ago, using the timestamp(s) selected by
 *   $timeTypeFilter (self::CHANGED_FILTER, self::CREATED_FILTER or
 *   self::CHANGED_OR_CREATED_FILTER). Any other value adds no date condition.
 * - $typeFilter 'pdf' keeps only URLs ending in ".pdf", 'html' keeps only the
 *   others; any other value does not filter by type.
 * - $timeTypeFilter equal to self::DELETED_FILTER selects webpages whose
 *   last_status_code is not '200'; otherwise only those with '200'.
 * - $search, when truthy, matches against snapshot HTML or the webpage URL;
 *   the query then goes through remember(5000) and is limited to 3000 rows.
 *
 * Each returned webpage gets a `level` attribute: 1 when searching, otherwise
 * the number of '/' in its URL minus 2 (presumably discounting the two
 * slashes after the scheme -- TODO confirm).
 *
 * @param mixed $siteId
 * @param string|null $search
 * @param int|string|null $timeFilter
 * @param string|null $typeFilter
 * @param mixed $timeTypeFilter
 * @return mixed Collection of Webpage models as produced by get()->unique().
 */
public static function makeList($siteId, $search, $timeFilter, $typeFilter, $timeTypeFilter)
{
    // Date restriction; which timestamp is compared depends on $timeTypeFilter.
    $timeFunction = function ($query) use ($timeFilter, $timeTypeFilter) {
        $isoDate = date('Y-m-d', strtotime("-{$timeFilter} days"));

        if ($timeTypeFilter == self::CHANGED_FILTER) {
            $query->where('snapshots.created_at', '>', $isoDate)
                ->whereNotNull('snapshots.changes');
        } elseif ($timeTypeFilter == self::CREATED_FILTER) {
            $query->where('webpages.created_at', '>', $isoDate);
        } elseif ($timeTypeFilter == self::CHANGED_OR_CREATED_FILTER) {
            $query->where('webpages.created_at', '>', $isoDate)
                ->orWhere('snapshots.created_at', '>', $isoDate)
                ->whereNotNull('snapshots.changes');
        }
    };

    // Combined time + document-type restriction, applied as one nested group.
    $filterFunction = function ($query) use ($timeFilter, $typeFilter, $timeFunction) {
        if ($timeFilter) {
            $query->where($timeFunction);
        }

        if ($typeFilter == 'pdf') {
            $query->where('webpages.url', 'LIKE', '%.pdf');
        } elseif ($typeFilter == 'html') {
            $query->where('webpages.url', 'NOT LIKE', '%.pdf');
        }
    };

    // Free-text match on either the snapshot HTML or the webpage URL.
    $searchFunction = function ($query) use ($search) {
        $query->where('snapshots.html', 'LIKE', "%{$search}%")
            ->orWhere('webpages.url', 'LIKE', "%{$search}%");
    };

    // Columns are table-qualified throughout because the query joins
    // snapshots; an unqualified name would become ambiguous if both tables
    // ever carried it.
    $query = Webpage::join('snapshots', 'snapshots.webpage_id', '=', 'webpages.id')
        ->where('webpages.site_id', $siteId)
        ->where($filterFunction)
        ->orderBy('webpages.url', 'asc');

    $statusOperator = $timeTypeFilter == self::DELETED_FILTER ? '!=' : '=';
    $query->where('webpages.last_status_code', $statusOperator, '200');

    if ($search) {
        $query = $query->where($searchFunction)->remember(5000)->take(3000);
    }

    $webpages = $query->get(['webpages.*'])->unique();

    foreach ($webpages as $webpage) {
        $webpage->level = $search ? 1 : substr_count($webpage->url, '/') - 2;
    }

    return $webpages;
}
/**
 * List the webpages belonging to the site named in the request.
 *
 * The `site_id` request input selects which Webpage rows are loaded; they
 * are handed to the 'webpage.index' view under the name `webpages`.
 *
 * @param Request $request
 * @return mixed Whatever view() produces for 'webpage.index'.
 */
public function index(Request $request)
{
    $siteId = $request->input('site_id');
    $pages = Webpage::where('site_id', $siteId)->get();

    return view('webpage.index', ['webpages' => $pages]);
}
/**
 * Make sure a Webpage row exists for the given URL under the given site.
 *
 * Delegates to Webpage::firstOrCreate() with the stringified URL and the
 * site id, and hands the resulting model back to the caller (earlier the
 * result was assigned to a local and discarded).
 *
 * @param mixed $siteId Value stored in / matched against `site_id`.
 * @param Url $url Cast to string for the `url` attribute.
 * @return mixed The model returned by Webpage::firstOrCreate().
 */
public function create($siteId, Url $url)
{
    return Webpage::firstOrCreate(['url' => (string) $url, 'site_id' => $siteId]);
}