Beispiel #1
0
 /**
  * Fetches the page behind $url, records the HTTP outcome on $url and
  * collects the page's links that point at the fleapop.com host.
  *
  * The collected links are stored in $this->extractedUrls (reset on every
  * call), each one level deeper than $url.
  *
  * @param Url $url URL to fetch; its status is set to Url::STATUS_ERROR for
  *                 HTTP status codes >= 400 and to Url::STATUS_OK otherwise.
  * @return bool Always true.
  */
 public function process(Url &$url)
 {
     $this->extractedUrls = [];
     $client = new \Goutte\Client();
     $crawler = $client->request('GET', $url->getUrl());
     $url->setStatus($client->getResponse()->getStatus() >= 400 ? Url::STATUS_ERROR : Url::STATUS_OK);
     // Query the anchors once; the previous duplicate filter('a') result was discarded.
     $links = $crawler->filter('a')->links();
     foreach ($links as $link) {
         $eu = new Url($link->getUri(), $url->getDepth() + 1);
         // The dot is escaped and the host must end right after "fleapop.com"
         // (port, path, query, fragment or end of string). Otherwise look-alike
         // hosts such as "fleapop.com.evil.test" or "fleapop.com@evil.test"
         // would be accepted as internal links.
         if (preg_match('~^https?://fleapop\.com(?=[:/?#]|$)~i', $eu->getUrl())) {
             $this->extractedUrls[] = $eu;
         }
     }
     return true;
 }
 /**
  * Requests the given URL with the web client and records the outcome on it.
  *
  * The URL's status is set to Url::STATUS_OK when the request succeeds with
  * an HTTP status below 400. On any exception, including the HttpException
  * raised here for statuses >= 400, the status is set to Url::STATUS_ERROR
  * and the exception is rethrown.
  *
  * @param Url $url
  * @return Crawler
  * @throws \Exception
  */
 protected function crawlUrl(&$url)
 {
     $webClient = $this->getWebClient();
     $target = $url->getUrl();

     try {
         $page = $webClient->request('GET', $target);
         $httpStatus = $webClient->getResponse()->getStatus();
         if (!($httpStatus < 400)) {
             throw new HttpException($httpStatus);
         }
     } catch (\Exception $failure) {
         // Mark the URL as failed before handing the error to the caller.
         $url->setStatus(Url::STATUS_ERROR);
         throw $failure;
     }

     $url->setStatus(Url::STATUS_OK);

     return $page;
 }
 /**
  * Stores the URL as a new, not-yet-processing document in the Mongo collection.
  *
  * The document uses the URL's id as `_id`. After a successful insert the
  * collection indexes are (re)created via createIndexes().
  *
  * @param Url $url
  * @return bool True when the document was inserted, false when the insert
  *              failed with error code 11000 (MongoDB's duplicate-key error).
  * @throws \MongoCursorException For any other insert failure.
  */
 public function insert(Url $url)
 {
     try {
         $collection = $this->getMongoCollection();
         $document = [
             '_id' => $url->getId(),
             'processing' => false,
             'url' => $url->toArray(),
         ];
         $collection->insert($document);
     } catch (\MongoCursorException $error) {
         if ($error->getCode() !== 11000) {
             throw $error;
         }

         // Duplicate key: the URL is already stored.
         return false;
     }

     $this->createIndexes();

     return true;
 }