/**
 * Crawl the domain given via the --domain option (3 link-levels deep)
 * and render a table of how many URLs were collected per HTTP status code.
 *
 * Note: the original version also accumulated the full link info of every
 * non-200 URL into a local $results array that was never read afterwards;
 * that dead work has been removed.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $domain = $input->getOption('domain');
    $crawler = new \Arachnid\Crawler($domain, 3);

    $output->writeln(sprintf('<info>Crawling %s...</info>', $domain));
    $crawler->traverse();

    $links = $crawler->getLinks();
    $output->writeln(sprintf('Collected <comment>%s</comment> URLs', count($links)));

    $table = $this->getHelper('table');
    $table->setHeaders(['Status', 'No. of URLs']);

    // status code => number of URLs seen with that code
    $counts = [];
    foreach ($links as $url => $info) {
        // Some entries (e.g. unvisited/unreachable links) carry no status code.
        if (!array_key_exists('status_code', $info)) {
            continue;
        }

        $status = $info['status_code'];
        if (!isset($counts[$status])) {
            $counts[$status] = 0;
        }
        $counts[$status]++;
    }

    $rows = [];
    foreach ($counts as $status => $count) {
        $rows[] = [$status, $count];
    }
    $table->setRows($rows);
    $table->render($output);
}
/**
 * Generate a sitemap.xml for the site given as the base_url argument.
 *
 * Validates the URL and the change-frequency option, crawls the site to
 * the requested depth, builds the sitemap DOM via getSitemapDocument(),
 * and writes it to --sitemap_path (default: <kernel root>/../web/sitemap.xml).
 *
 * Fixes over the original:
 *  - `catch (Exception $ex)` lacked a leading backslash; in a namespaced
 *    class that resolves to a namespace-local Exception and never matches.
 *  - DOMDocument::save() does not throw on failure — it returns false —
 *    so the failed-write case is now detected via the return value.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $depth = $input->getOption('links_depth');
    $frequency = $input->getOption('frequency');
    $base_url = $input->getArgument('base_url');

    if (filter_var($base_url, FILTER_VALIDATE_URL) === false) {
        $output->writeln('<error>invalid base url, please make sure its full url; ex. http://example.com</error>');
        return;
    }

    // Allowed <changefreq> values per the sitemaps.org protocol.
    $valid_frequency_arr = array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never');
    if (in_array($frequency, $valid_frequency_arr, true) === false) {
        $output->writeln("<error>invalid frequency provided, allowed values: " . implode(',', $valid_frequency_arr) . "</error>");
        return;
    }

    $output->writeln("<info>begin crawling site</info>");
    $output->writeln('<comment>begin crawling ' . $base_url . '</comment>');

    $crawler = new \Arachnid\Crawler($base_url, $depth);
    $crawler->traverse();
    $links = $crawler->getLinks();
    $output->writeln("<info>" . count($links) . " links found in the url</info>");

    $output->writeln("<comment>start generating sitemap file</comment>");
    $dom_doc = $this->getSitemapDocument($output, $links, $frequency);
    $output->writeln("<comment>finished generating sitemap file</comment>");
    $output->writeln("");

    $sitemap_path = $input->getOption('sitemap_path');
    if (empty($sitemap_path)) {
        $sitemap_path = $this->getContainer()->get('kernel')->getRootDir() . '/../web/sitemap.xml';
    }

    try {
        // DOMDocument::save() returns false (and raises a warning) on
        // failure instead of throwing, so check the result explicitly.
        if ($dom_doc->save($sitemap_path) === false) {
            $output->writeln("<error>Error: could not write sitemap file to " . $sitemap_path . "</error>");
            return;
        }
        $output->writeln("<info>sitemap file written to " . $sitemap_path . "</info>");
    } catch (\Exception $ex) {
        $output->writeln("<error>Error: " . $ex->getMessage() . " on line " . $ex->getLine() . "</error>");
    }
}
/**
 * Crawl $url (2 link-levels deep) and return the absolute URLs of all
 * internal (non-external) pages that were discovered.
 *
 * Fixes over the original:
 *  - require_once instead of require, so repeated calls do not re-execute
 *    the Composer bootstrap (matches the convention used elsewhere here).
 *  - The 'external_link' key is read with empty(), avoiding an undefined-
 *    index notice for link entries that do not carry that key.
 *
 * @param string $url base URL to crawl
 * @return string[] list of absolute internal URLs
 */
private function findPages($url)
{
    require_once 'vendor/autoload.php';

    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, 2);
    $crawler->traverse();

    // Get link data
    $links = $crawler->getLinks();

    $pages = [];
    foreach ($links as $link) {
        // Keep only links that resolved to an absolute URL and are not
        // flagged as external (missing flag is treated as internal).
        if (isset($link['absolute_url']) && empty($link['external_link'])) {
            $pages[] = $link['absolute_url'];
        }
    }

    return $pages;
}
/**
 * Crawl $url to $depth link-levels and return every (link text, URL) pair
 * whose link text contains a job title listed in job_titles.csv
 * (first CSV column, whole-word, case-insensitive match).
 *
 * Fixes over the original:
 *  - The CSV handle was never rewound, so only the first link text per
 *    link was actually matched (later texts read from an exhausted file).
 *    The titles are now loaded once into memory before the loops.
 *  - $o_job_title was never reset between texts, so a stale match was
 *    re-appended for non-matching texts; the match is now scoped per text.
 *  - $output is initialized, so an empty result returns [] instead of
 *    an undefined variable (null).
 *  - CSV content is passed through preg_quote() so regex metacharacters
 *    in a title cannot break (or inject into) the pattern.
 *
 * @param string $url   base URL to crawl
 * @param int    $depth crawl depth (default 1)
 * @return array list of [matched link text, link URL] pairs
 */
function crawl_page($url, $depth = 1)
{
    require_once 'vendor/autoload.php';

    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, $depth);
    $crawler->traverse();

    // Get link data
    $links = $crawler->getLinks();

    // Load the job-title list once, up front.
    $job_titles = array();
    $job_title_file = fopen("job_titles.csv", 'r');
    if ($job_title_file !== false) {
        while (($row = fgetcsv($job_title_file)) !== false) {
            if (isset($row[0]) && $row[0] !== '') {
                $job_titles[] = $row[0];
            }
        }
        fclose($job_title_file);
    }

    $output = array();
    foreach ($links as $lnk => $link) {
        if (empty($link['links_text'])) {
            continue;
        }
        foreach ($link['links_text'] as $tx) {
            foreach ($job_titles as $title) {
                // Whole-word, case-insensitive match; preg_quote keeps
                // CSV content from acting as regex metacharacters.
                if (preg_match("/\\b" . preg_quote($title, '/') . "\\b/i", $tx)) {
                    $output[] = array($tx, $lnk);
                    break; // first matching title wins for this text
                }
            }
        }
    }

    return $output;
}
<?php
// Standalone crawl script: bootstrap Composer's autoloader
// (script lives one directory below vendor/).
require '../vendor/autoload.php';

// Initiate crawl
// Crawl the target site up to 3 link-levels deep.
$crawler = new \Arachnid\Crawler('http://www.compagnieboisexotiques.com', 3);
$crawler->traverse();

// Get link data
// NOTE(review): presumably a map of URL => crawl metadata; $links appears
// unused in the visible chunk — the file may continue beyond this view.
$links = $crawler->getLinks();