/**
 * Registers the URL given as the "url" argument as a new Page with depth 0.
 *
 * The URL is rejected when it fails FILTER_VALIDATE_URL or when the Page
 * repository already holds an entry with the same URL.
 *
 * @param InputInterface  $input  Console input; the "url" argument is read.
 * @param OutputInterface $output Console output used for status messages.
 *
 * @return int Exit status: 0 when the page was stored, 1 when the URL was rejected.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $doctrine = $this->getContainer()->get('doctrine');
    $em = $doctrine->getManager();
    $pageRepository = $doctrine->getRepository('S2bCrawlerBundle:Page');

    $url = $input->getArgument('url');

    if (filter_var($url, FILTER_VALIDATE_URL) === false) {
        $output->writeln('<error>Invalid url</error>');

        // Non-zero status so shells and schedulers can detect the failure.
        return 1;
    }

    if ($pageRepository->findOneByUrl($url)) {
        $output->writeln('<error>Url already in database</error>');

        return 1;
    }

    $page = new Page();
    $page->setUrl($url);
    $page->setDepth(0);

    $em->persist($page);
    $em->flush();

    $output->writeln('Url added successfully');

    return 0;
}
/**
 * Crawls the stored Page identified by the "id" argument.
 *
 * Downloads the page URL, stores the body and the filtered links in a
 * PageCrawled record, and queues every link that is not yet in the
 * database as a new Page one level deeper than the crawled page.
 *
 * Client errors reported by the HTTP client are handled as follows:
 *  - 404: the page is stamped with a crawl date and the command stops;
 *  - 503: the request is retried after a one second pause, up to a fixed
 *    number of attempts;
 *  - anything else: the error is reported and the page is left untouched
 *    so that it can be crawled again later.
 *
 * Exceptions other than ClientException are not caught here.
 *
 * @param InputInterface  $input  Console input; the "id" argument is read.
 * @param OutputInterface $output Console output used for progress and errors.
 *
 * @return int Exit status: 0 when the page was crawled, 1 otherwise.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $doctrine = $this->getContainer()->get('doctrine');
    $em = $doctrine->getManager();
    $pageRepository = $doctrine->getRepository('S2bCrawlerBundle:Page');

    $id = $input->getArgument('id');
    $page = $pageRepository->findOneById($id);

    if (!$page) {
        $output->writeln("<error>Page not found</error>");

        return 1;
    }

    if ($page->isCrawled()) {
        $output->writeln("<error>Page already crawled</error>");

        return 1;
    }

    $client = new Client();
    $response = null;
    $maxAttempts = 3;

    // Every pass through this loop either obtains a response (break),
    // returns, or retries, so $response is always set once the loop ends.
    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            $response = $client->get($page->getUrl());
            break;
        } catch (ClientException $e) {
            // Cast because the status code type differs between HTTP client versions.
            $status = (int) $e->getResponse()->getStatusCode();

            if ($status === 404) {
                $output->writeln('<error>Not found ' . $page->getUrl() . '</error>');
                // Stamp the crawl date on the missing page before giving up on it.
                $page->setCrawledAt(new \DateTime());
                $em->persist($page);
                $em->flush();

                return 1;
            }

            if ($status === 503 && $attempt < $maxAttempts) {
                // Temporarily unavailable: pause briefly, then try again.
                sleep(1);
                continue;
            }

            // Any other client error (or 503 after the last attempt): report it
            // and stop instead of going on without a response.
            $output->writeln('<error>HTTP ' . $status . ' ' . $page->getUrl() . '</error>');

            return 1;
        }
    }

    $body = (string) $response->getBody();

    $links = $this->parseLinks($body, $page->getUrl());
    $links = $this->filterLinks($links);

    $crawled = new PageCrawled();
    $crawled->setPage($page)->setContent($body)->setLinks($links);
    $page->setCrawled($crawled)->setCrawledAt(new \DateTime());

    if ($links) {
        $links_added = 0;

        foreach ($links as $i => $link) {
            if ($pageRepository->findOneByUrl($link)) {
                if ($output->isVeryVerbose()) {
                    $output->writeln("Skipped #" . $i . ' ' . $link . ' - already in database');
                }
                continue;
            }

            $linkPage = new Page();
            $linkPage->setUrl($link)->setDepth($page->getDepth() + 1);
            $em->persist($linkPage);
            // Flushed per link so the findOneByUrl() lookup above can see
            // pages added earlier in this same loop.
            $em->flush();
            $links_added++;

            if ($output->isVerbose()) {
                $output->writeln("Added #" . $i . ' ' . $link);
            }
        }

        $output->writeln('Added ' . $links_added . '/' . count($links) . ' links');
    }

    $em->persist($crawled);
    $em->persist($page);
    $em->flush();

    $output->writeln('Crawled ' . $page->getUrl());

    return 0;
}