Пример #1
0
 /**
  * Crawl the OLX listing pages and queue every previously unseen offer URL for parsing.
  *
  * Pages are requested one after another starting at page 1. Every offer link found
  * on a page has its fragment ("#...") stripped and is then looked up in GrabbedUrl:
  *  - a known URL gets its `count` attribute incremented and is reported as "dropped";
  *  - an unknown URL is stored and a Parse job is dispatched for it ("grabbed").
  *
  * The loop keeps going while the last page produced at least one new URL. Each page
  * that produced none uses up one of five retries (the budget is never reset); the
  * crawl stops at the first such page after the budget is exhausted.
  *
  * A ServerException from the HTTP client is logged and ends the crawl; any other
  * exception propagates to the caller.
  *
  * @return void
  */
 public static function grabLinks()
 {
     $client = new Client();
     $baseUrl = 'http://olx.ua/nedvizhimost/cri/?page=%d';
     $page = 1;
     $retryCount = 5;
     // Resolved once up front: the dispatcher does not change from page to page.
     /** @var Dispatcher $dispatcher */
     $dispatcher = app('Illuminate\\Bus\\Dispatcher');
     do {
         $time = time();
         try {
             $response = $client->get(sprintf($baseUrl, $page++));
         } catch (ServerException $e) {
             Log::error('OLX server error', ['code' => $e->getCode(), 'message' => $e->getMessage()]);
             return;
         }
         // Reaching this point means the request returned a response object, so no
         // null check is needed before parsing it.
         $crawler = new Crawler($response->getBody()->getContents());
         $urls = array_map(function ($link) {
             // Drop the URL fragment so the same offer always maps to the same key.
             return preg_replace('/#.*$/', '', $link);
         }, $crawler->filter('td.offer a.link')->extract('href'));
         if (0 === count($urls)) {
             Log::info('Grabbed 0 OLX links');
         }
         $grabbedCount = 0;
         $droppedCount = 0;
         foreach ($urls as $url) {
             /** @var GrabbedUrl|null $grabbedUrl */
             $grabbedUrl = GrabbedUrl::where('url', $url)->first();
             if ($grabbedUrl) {
                 // Already known: only record that it was seen again.
                 $grabbedUrl->count = $grabbedUrl->count + 1;
                 $grabbedUrl->save();
                 $droppedCount++;
             } else {
                 $grabbedUrl = GrabbedUrl::create(['url' => $url]);
                 $dispatcher->dispatch(new Parse($grabbedUrl));
                 $grabbedCount++;
             }
         }
         // Per-page progress line: elapsed seconds, new URLs, already-known URLs.
         printf('[%d] grabbed: %d, dropped: %d' . PHP_EOL, time() - $time, $grabbedCount, $droppedCount);
     } while ($grabbedCount > 0 || $retryCount-- > 0);
 }
Пример #2
0
 /**
  * Execute the job: download one offer page, turn it into an Offer and persist it.
  *
  * Steps, in order:
  *  1. Request $this->url->url without following redirects.
  *  2. On a 301: if the Location header contains "from404" the offer is logged as
  *     deleted; otherwise the Location is stored as a new GrabbedUrl and a new Parse
  *     job is dispatched for it. Either way this job ends here.
  *  3. Extract the required fields (phones, cat_path); a failure is logged and rethrown.
  *  4. Extract the optional fields and the details tables; failures are collected and
  *     reported in a single warning (a lone missing price is not reported).
  *  5. Save the offer. A QueryException with code 23000 ends the job silently; any
  *     other QueryException is logged and rethrown.
  *  6. Run phone detection (best effort), and for a newly created offer store its
  *     photos and location.
  *  7. Fire the OfferParsed event.
  *
  * @param ParserLoggerInterface $logger logger that receives parsing diagnostics
  * @param Client                $client HTTP client used to fetch the offer page
  *
  * @return void
  *
  * @throws ClientException when the page request fails with this exception (logged first)
  * @throws QueryException  when saving the offer fails with a code other than 23000
  * @throws \Exception      when a required field cannot be extracted
  */
 public function handle(ParserLoggerInterface $logger, Client $client)
 {
     $context = ['url' => $this->url->url];
     try {
         $response = $client->get($this->url->url, ['allow_redirects' => false]);
     } catch (ClientException $e) {
         $logger->error('Client error', self::arrayInsert($context, 'exception', $e));
         throw $e;
     }
     $code = $response->getStatusCode();
     if (301 === $code) {
         $redirectUrl = $response->getHeaderLine('Location');
         if (preg_match('/from404/', $redirectUrl)) {
             $logger->warning('Offer deleted', $context);
             return;
         }
         try {
             $grabbedUrl = GrabbedUrl::create(['url' => $redirectUrl]);
         } catch (QueryException $e) {
             // presumably the redirect target is already stored — TODO confirm
             return;
         }
         if ($grabbedUrl) {
             $this->dispatch(new Parse($grabbedUrl));
             $logger->info('Dispatched new job', self::arrayInsert($context, 'new_url', $redirectUrl));
         }
         return;
     }
     if (200 !== $code) {
         // NOTE(review): after logging, parsing still continues with this response,
         // exactly as before. Confirm whether the job should stop here instead.
         $logger->error('Invalid response code ' . $code, $context);
     }
     $responseHtml = $response->getBody()->getContents();
     $crawler = new Crawler($responseHtml);
     $offer = new Offer();
     $offer->href = $this->url->url;
     // Required fields: the first failure aborts the job.
     $requiredFieldsCommands = [
         'phones' => function () use ($client) {
             return self::getPhones($client, self::getOlxId($this->url->url));
         },
         'cat_path' => function () use ($responseHtml) {
             return self::getCatPath($responseHtml);
         },
     ];
     foreach ($requiredFieldsCommands as $field => $command) {
         try {
             $offer->{$field} = $command();
         } catch (\Exception $e) {
             $logger->error('Unable to extract required fields', self::arrayInsert($context, 'required_field', $field));
             throw $e;
         }
     }
     // Optional fields: a failure is only recorded, the job carries on.
     $fieldsCommands = [
         'price_string' => function () use ($crawler) {
             return $crawler->filter('.pricelabel.tcenter')->first()->text();
         },
         'title' => function () use ($crawler) {
             return $crawler->filter('.offerheadinner > h1')->text();
         },
         'olx_id' => function () {
             return self::getOlxId($this->url->url);
         },
         'description' => function () use ($crawler) {
             return $crawler->filter('#textContent')->text();
         },
         'date_string' => function () use ($crawler) {
             return self::getDate($crawler);
         },
         'offer_number' => function () use ($crawler) {
             return self::getOfferNumber($crawler);
         },
     ];
     $failedFields = [];
     foreach ($fieldsCommands as $field => $command) {
         try {
             $offer->{$field} = trim($command());
         } catch (\Exception $e) {
             $failedFields[] = $field;
         }
     }
     $detailsTables = $crawler->filter('table.details table.item');
     $details = [];
     foreach ($detailsTables as $detailsTable) {
         // Iterating a Crawler yields raw DOM nodes, not Crawler instances.
         /** @var \DOMElement $detailsTable */
         try {
             $details[$detailsTable->getElementsByTagName('th')->item(0)->textContent] = preg_replace('/\\t*|\\n*|\\s{2,}/u', '', $detailsTable->getElementsByTagName('td')->item(0)->textContent);
         } catch (\Exception $e) {
             // DOM nodes have no text() method; use the DOM property so that
             // recording the failure cannot itself raise a fatal error.
             $failedFields[] = $detailsTable->textContent;
         }
     }
     if ($details) {
         $offer->details = $details;
     }
     if ($failedFields && $failedFields !== ["price_string"]) {
         // lots of offers don't have assigned price
         $logger->warning('Failed fields', self::arrayInsert($context, 'failed_fields', $failedFields));
     }
     if ($olxTimestamp = self::parseDate($offer->date_string)) {
         $offer->created_at_olx = $olxTimestamp;
     }
     try {
         $offer->save();
     } catch (QueryException $e) {
         if (23000 === intval($e->getCode())) {
             // duplicate offer, finish job
             return;
         }
         $logger->error('Failed to save offer', self::arrayInsert($context, 'exception', $e));
         throw $e;
     }
     // Phone detection is best effort: a failure is logged and the job continues.
     try {
         $detectPhones = new DetectPhones($offer);
         $detectPhones->handle();
     } catch (\Exception $e) {
         \Log::critical('Failed to detect phones', ['exception' => $e]);
     }
     if ($offer->wasRecentlyCreated) {
         $photos = [];
         try {
             $photos = $crawler->filter('#bigGallery a')->reduce(function (Crawler $node) {
                 // Keep only gallery links that actually carry an href.
                 return (bool) $node->attr('href');
             })->extract('href');
         } catch (\Exception $e) {
             // Fully qualified so the catch also matches inside a namespaced file.
             $logger->error('Failed to get photos', self::arrayInsert($context, 'exception', $e));
         }
         foreach ($photos as $photo) {
             $offer->photos()->create(['url' => $photo]);
         }
         if ($locationString = self::getLocationString($crawler)) {
             // Reuse an existing Location row, otherwise try to create one.
             $location = Location::where('location', $locationString)->first();
             if (!$location) {
                 try {
                     $location = Location::create(['location' => $locationString]);
                 } catch (QueryException $e) {
                     $logger->error('Unable to create Location', self::arrayInsert($context, 'location', $locationString));
                 }
             }
             if ($location instanceof Location) {
                 $offer->location()->associate($location);
                 $offer->save();
             }
         }
     }
     Event::fire(new OfferParsed($offer));
 }