/**
 * Fetches the given URL, records its HTTP status and collects on-site links.
 *
 * The list of extracted URLs is reset on every call. The status of $url is
 * set to Url::STATUS_ERROR when the response status is 400 or above and to
 * Url::STATUS_OK otherwise. Every anchor found in the response is turned
 * into a Url one level deeper than $url; only those whose host is exactly
 * fleapop.com (http or https) are kept in $this->extractedUrls.
 *
 * @param Url $url URL to fetch; its status is updated in place.
 */
public function process(Url &$url)
{
    $this->extractedUrls = [];

    $client = new \Goutte\Client();
    $crawler = $client->request('GET', $url->getUrl());

    $url->setStatus(
        $client->getResponse()->getStatus() >= 400 ? Url::STATUS_ERROR : Url::STATUS_OK
    );

    foreach ($crawler->filter('a')->links() as $link) {
        $eu = new Url($link->getUri(), $url->getDepth() + 1);

        // The dot is escaped and the host must be followed by a path, port,
        // query, fragment or end of string, so look-alike hosts such as
        // "fleapop.com.evil.test" or "fleapopxcom" are rejected.
        if (preg_match('~^https?://fleapop\.com(?:[/:?#]|$)~i', $eu->getUrl())) {
            $this->extractedUrls[] = $eu;
        }
    }
}
/**
 * Requests the given URL with the shared web client and returns the crawler.
 *
 * The status of $url is set to Url::STATUS_OK when the request succeeds with
 * a response status below 400. On any exception raised by the request, or
 * when the response status is 400 or above, the status is set to
 * Url::STATUS_ERROR and the exception is rethrown.
 *
 * @param Url $url URL to request; its status is updated in place.
 * @return Crawler
 * @throws \Exception Whatever the request raised, or an HttpException built
 *                    from the response status when it is 400 or above.
 */
protected function crawlUrl(&$url)
{
    $webClient = $this->getWebClient();
    $target = $url->getUrl();

    try {
        $page = $webClient->request('GET', $target);

        $httpStatus = $webClient->getResponse()->getStatus();
        if ($httpStatus >= 400) {
            throw new HttpException($httpStatus);
        }
    } catch (\Exception $failure) {
        // Mark the URL as failed before letting the caller see the error.
        $url->setStatus(Url::STATUS_ERROR);

        throw $failure;
    }

    $url->setStatus(Url::STATUS_OK);

    return $page;
}
/**
 * Stores the given URL as a new, not-yet-processing document.
 *
 * The document is keyed by the URL's id. A write failure with code 11000
 * is reported as `false` instead of an exception; any other
 * MongoCursorException is rethrown. Indexes are (re)created after each
 * successful insert.
 *
 * @param Url $url URL to store.
 * @return bool True when the document was inserted, false when the write
 *              failed with code 11000.
 * @throws \MongoCursorException For write failures other than code 11000.
 */
public function insert(Url $url)
{
    try {
        $collection = $this->getMongoCollection();
        $document = [
            '_id' => $url->getId(),
            'processing' => false,
            'url' => $url->toArray(),
        ];

        $collection->insert($document);
    } catch (\MongoCursorException $writeError) {
        if ($writeError->getCode() !== 11000) {
            throw $writeError;
        }

        return false;
    }

    $this->createIndexes();

    return true;
}