示例#1
0
 /**
  * Observer hook invoked once the crawler has finished with a url.
  *
  * Bumps the crawl counter of the stored Webpage whose `url` column equals
  * the crawled url (when such a row exists), then hands the url and the
  * response over to SaveSnapshot::create().
  *
  * @param \SimZal\Crawler\Url                      $url
  * @param \Psr\Http\Message\ResponseInterface|null $response
  */
 public function hasBeenCrawled(Url $url, $response)
 {
     $page = Webpage::where('url', (string) $url)->first();

     // Only a url that is already tracked has a counter to bump.
     if ($page !== null) {
         $page->crawlcount++;
         $page->save();
     }

     // Runs whether or not a matching Webpage was found above.
     SaveSnapshot::create($url, $response);
 }
示例#2
0
 /**
  * Pick the next page to crawl.
  *
  * Prefers the stored Webpage with the lowest `crawlcount`; when no Webpage
  * exists at all, falls back to a randomly chosen Site.
  *
  * @return array with the keys 'url' and 'siteId'
  */
 public function pageToCrawl()
 {
     $leastCrawled = Webpage::orderBy('crawlcount', 'ASC')->limit(1)->first();

     if ($leastCrawled !== null) {
         return ['url' => $leastCrawled->url, 'siteId' => $leastCrawled->site_id];
     }

     // No webpage is stored: start from one of the known sites instead.
     $fallbackSite = Site::all()->random();

     return ['url' => $fallbackSite->url, 'siteId' => $fallbackSite->id];
 }
示例#3
0
 /**
  * Store a snapshot of a crawled url.
  *
  * Does nothing when no Webpage row matches the url. Otherwise a Snapshot is
  * created holding the response status code (self::UNRESPONSIVE_HOST when
  * there is no response), the filtered body when self::isHtml() accepts the
  * response (an empty string otherwise), the webpage id, and the md5 of the
  * stored html.
  *
  * @param Url   $url      crawled url; cast to string for the lookup
  * @param mixed $response response object exposing getStatusCode() and getBody(), or null
  */
 public static function create(Url $url, $response)
 {
     // The column is addressed as lower-case `url`; a differently-cased name
     // breaks on database grammars that quote identifiers case-sensitively.
     $webpage = Webpage::where('url', (string) $url)->first();
     if (!$webpage) {
         return;
     }

     $statusCode = $response ? $response->getStatusCode() : self::UNRESPONSIVE_HOST;

     $html = '';
     // Without a response there is no body to read, so never reach getBody() on null.
     if ($response && self::isHtml($response)) {
         $html = FilterContent::make($response->getBody()->getContents());
     }

     Snapshot::create([
         'html' => $html,
         'status_code' => $statusCode,
         'webpage_id' => $webpage->id,
         // Hash of the filtered html as stored, not of the raw body.
         'hash' => md5($html),
     ]);
 }
示例#4
0
 /**
  * Build the list of webpages of a site that match the given filters.
  *
  * Webpages are joined with their snapshots, restricted to the site, filtered
  * by time window, file type and (optionally) a search term, ordered by url
  * and de-duplicated. Each returned webpage gets a `level` attribute.
  *
  * @param mixed  $siteId         site whose webpages are listed
  * @param string $search         when truthy, keep only pages whose snapshot html or url contains it
  * @param mixed  $timeFilter     look-back window in days; a falsy value disables the time filter
  * @param string $typeFilter     'pdf' or 'html'; any other value applies no type restriction
  * @param mixed  $timeTypeFilter compared against the self::*_FILTER constants
  *
  * @return mixed the result of get(['webpages.*'])->unique() with `level` set on every item
  */
 public static function makeList($siteId, $search, $timeFilter, $typeFilter, $timeTypeFilter)
 {
     // Limits rows to those created and/or changed after the cut-off date.
     $timeFunction = function ($query) use ($timeFilter, $timeTypeFilter) {
         $isoDate = date('Y-m-d', strtotime("-{$timeFilter} days"));
         if ($timeTypeFilter == self::CHANGED_FILTER) {
             $query->where('snapshots.created_at', '>', $isoDate)->whereNotNull('snapshots.changes');
         } elseif ($timeTypeFilter == self::CREATED_FILTER) {
             $query->where('webpages.created_at', '>', $isoDate);
         } elseif ($timeTypeFilter == self::CHANGED_OR_CREATED_FILTER) {
             $query->where('webpages.created_at', '>', $isoDate)->orWhere('snapshots.created_at', '>', $isoDate)->whereNotNull('snapshots.changes');
         }
     };

     // Combines the optional time window with the optional pdf/html restriction.
     $filterFunction = function ($query) use ($timeFilter, $typeFilter, $timeFunction) {
         if ($timeFilter) {
             $query->where($timeFunction);
         }
         if ($typeFilter == 'pdf') {
             $query->where('webpages.url', 'LIKE', '%.pdf');
         } elseif ($typeFilter == 'html') {
             $query->where('webpages.url', 'NOT LIKE', '%.pdf');
         }
     };

     // Matches the search term against either the snapshot html or the page url.
     $searchFunction = function ($query) use ($search) {
         $query->where('snapshots.html', 'LIKE', "%{$search}%")->orWhere('webpages.url', 'LIKE', "%{$search}%");
     };

     $query = Webpage::join('snapshots', 'snapshots.webpage_id', '=', 'webpages.id')->whereSiteId($siteId)->where($filterFunction)->orderBy('url', 'asc');

     // The "deleted" view lists pages whose last status is not 200; every other view lists the 200s.
     $query->where('webpages.last_status_code', $timeTypeFilter == self::DELETED_FILTER ? '!=' : '=', '200');

     if ($search) {
         $query = $query->where($searchFunction)->remember(5000)->take(3000);
     }

     // The join yields one row per snapshot, so collapse repeated webpages.
     $webpages = $query->get(['webpages.*'])->unique();

     foreach ($webpages as $webpage) {
         // Search results are flat (level 1); otherwise the level is the number
         // of '/' in the url minus 2 (presumably the path depth; confirm with the view).
         $webpage->level = $search ? 1 : substr_count($webpage->url, '/') - 2;
     }

     return $webpages;
 }
示例#5
0
 /**
  * List the webpages that belong to the site named by the request's
  * `site_id` input.
  *
  * @param Request $request source of the `site_id` input
  *
  * @return mixed whatever view() produces for the 'webpage.index' template
  */
 public function index(Request $request)
 {
     $siteId = $request->input('site_id');
     $webpages = Webpage::where('site_id', $siteId)->get();

     return view('webpage.index', ['webpages' => $webpages]);
 }
示例#6
0
 /**
  * Make sure a Webpage record exists for the given url within the given site.
  *
  * Delegates to Webpage::firstOrCreate() keyed on `url` and `site_id`;
  * nothing is returned to the caller.
  *
  * @param mixed $siteId value stored in the `site_id` attribute
  * @param Url   $url    cast to string before it is used as the `url` attribute
  */
 public function create($siteId, Url $url)
 {
     $attributes = [
         'url' => (string) $url,
         'site_id' => $siteId,
     ];

     Webpage::firstOrCreate($attributes);
 }