Example #1
protected function execute(InputInterface $input, OutputInterface $output)
 {
     $domain = $input->getOption('domain');
     $crawler = new \Arachnid\Crawler($domain, 3);
     $output->writeln(sprintf('<info>Crawling %s...</info>', $domain));
     $crawler->traverse();
     $links = $crawler->getLinks();
     $output->writeln(sprintf('Collected <comment>%s</comment> URLs', count($links)));
     // Summarize crawl results in a console table, one row per HTTP status code
     $table = $this->getHelper('table');
     $table->setHeaders(array('Status', 'No. of URLs'));
     $counts = array();
     $results = array();
     foreach ($links as $url => $info) {
         // Skip links the crawler never resolved to an HTTP status
         if (!array_key_exists('status_code', $info)) {
             continue;
         }
         if (!array_key_exists($info['status_code'], $counts)) {
             $counts[$info['status_code']] = 1;
         } else {
             $counts[$info['status_code']]++;
         }
         // Record full details of every non-200 response, grouped by status
         if ($info['status_code'] !== 200) {
             if (!array_key_exists($info['status_code'], $results)) {
                 $results[$info['status_code']] = array();
             }
             $results[$info['status_code']][] = $info;
         }
     }
     // Render the per-status summary table
     $rows = array();
     foreach ($counts as $key => $value) {
         $rows[] = array($key, $value);
     }
     $table->setRows($rows);
     $table->render($output);
 }
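The command above tallies the non-200 responses into $results but never displays them. A minimal follow-up sketch, not part of the original, that lists those URLs; it assumes each $info entry carries an 'absolute_url' key, as the link data in Example #3 does:

// Sketch only: print the failing URLs gathered in $results above.
// 'absolute_url' is an assumed key, borrowed from Example #3's link data.
foreach ($results as $statusCode => $failed) {
    $output->writeln(sprintf('<error>URLs returning %s:</error>', $statusCode));
    foreach ($failed as $info) {
        $output->writeln('  - ' . $info['absolute_url']);
    }
}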
Example #2
protected function execute(InputInterface $input, OutputInterface $output)
 {
     $depth = $input->getOption('links_depth');
     $frequency = $input->getOption('frequency');
     $base_url = $input->getArgument('base_url');
     if (filter_var($base_url, FILTER_VALIDATE_URL) === false) {
         $output->writeln('<error>Invalid base URL; please provide a full URL, e.g. http://example.com</error>');
         return;
     }
     $valid_frequency_arr = array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never');
     if (in_array($frequency, $valid_frequency_arr) === false) {
         $output->writeln("<error>invalid frequency provided, allowed values: " . implode(',', $valid_frequency_arr) . "</error>");
         return;
     }
     $output->writeln("<info>begin crawling site</info>");
     $output->writeln('<comment>begin crawling ' . $base_url . '</comment>');
     $crawler = new \Arachnid\Crawler($base_url, $depth);
     $crawler->traverse();
     $links = $crawler->getLinks();
     $output->writeln("<info>" . count($links) . " links found in the url</info>");
     $output->writeln("<comment>start generating sitemap file</comment>");
     $dom_doc = $this->getSitemapDocument($output, $links, $frequency);
     $output->writeln("<comment>finished generating sitemap file</comment>");
     $output->writeln("");
     $sitemap_path = $input->getOption('sitemap_path');
     if (empty($sitemap_path)) {
         $sitemap_path = $this->getContainer()->get('kernel')->getRootDir() . '/../web/sitemap.xml';
     }
     try {
         $dom_doc->save($sitemap_path);
         $output->writeln("<info>sitemap file written to " . $sitemap_path . "</info>");
     } catch (\Exception $ex) { // leading backslash so the catch works inside a namespaced class
         $output->writeln("<error>Error: " . $ex->getMessage() . " on line " . $ex->getLine() . "</error>");
     }
 }
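For context, this command reads one argument and three options. Below is a minimal configure() sketch with names inferred from the getArgument()/getOption() calls above; the command name and the default values are assumptions, not taken from the original:

use Symfony\Component\Console\Input\InputArgument; // at the top of the command class file
use Symfony\Component\Console\Input\InputOption;

protected function configure()
{
    $this
        ->setName('sitemap:generate') // hypothetical command name
        ->addArgument('base_url', InputArgument::REQUIRED, 'Full base URL to crawl, e.g. http://example.com')
        ->addOption('links_depth', null, InputOption::VALUE_REQUIRED, 'Maximum crawl depth', 3)
        ->addOption('frequency', null, InputOption::VALUE_REQUIRED, 'Sitemap changefreq value', 'daily')
        ->addOption('sitemap_path', null, InputOption::VALUE_REQUIRED, 'Output path for sitemap.xml');
}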
Example #3
 private function findPages($url)
 {
     require 'vendor/autoload.php'; // usually loaded once at the application entry point, not per call
     // Initiate crawl
     $crawler = new \Arachnid\Crawler($url, 2);
     $crawler->traverse();
     // Get link data
     $links = $crawler->getLinks();
     $pages = [];
     foreach ($links as $link) {
         // Keep only internal pages that resolved to an absolute URL
         if (isset($link['absolute_url']) && empty($link['external_link'])) {
             $pages[] = $link['absolute_url'];
         }
     }
     return $pages;
 }
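A possible call site for the helper above; the URL is only a placeholder, and the call uses $this-> since the method is private:

// Collect internal page URLs and drop duplicates before further processing
$pages = array_unique($this->findPages('http://example.com'));
foreach ($pages as $page) {
    echo $page . PHP_EOL;
}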
Example #4
function crawl_page($url, $depth = 1)
{
    require_once 'vendor/autoload.php'; // usually loaded once at the application entry point

    // Initiate crawl and collect link data
    $crawler = new \Arachnid\Crawler($url, $depth);
    $crawler->traverse();
    $links = $crawler->getLinks();

    // Read the job titles once, instead of re-opening the CSV for every link;
    // the original re-read left the file pointer at EOF after the first pass
    $job_titles = array();
    $job_title_file = fopen('job_titles.csv', 'r');
    while (($row = fgetcsv($job_title_file)) !== false) {
        $job_titles[] = $row[0];
    }
    fclose($job_title_file);

    $output = array();
    foreach ($links as $lnk => $link) {
        if (empty($link['links_text'])) {
            continue;
        }
        // Compare each anchor text against every known job title
        foreach ($link['links_text'] as $tx) {
            foreach ($job_titles as $title) {
                // preg_quote() keeps CSV values from being parsed as regex syntax
                if (preg_match('/\b' . preg_quote($title, '/') . '\b/i', $tx)) {
                    $output[] = array($tx, $lnk);
                    break;
                }
            }
        }
    }
    return $output;
}
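A hedged usage sketch: assuming job_titles.csv holds one title per row in its first column, crawl_page() returns pairs of matched anchor text and URL:

$matches = crawl_page('http://example.com', 2); // placeholder URL
foreach ($matches as $match) {
    echo $match[0] . ' => ' . $match[1] . PHP_EOL;
}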
Example #5
<?php

require '../vendor/autoload.php';
// Initiate crawl
$crawler = new \Arachnid\Crawler('http://www.compagnieboisexotiques.com', 3);
$crawler->traverse();
// Get link data
$links = $crawler->getLinks();
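This snippet stops after collecting the link data. One possible next step, not part of the original, that reports broken links using the 'status_code' key seen in Example #1:

// Sketch: flag every crawled URL that did not return HTTP 200
foreach ($links as $url => $info) {
    if (isset($info['status_code']) && $info['status_code'] !== 200) {
        echo $url . ' returned ' . $info['status_code'] . PHP_EOL;
    }
}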
Example #6
 public function crawlWebPage(Webpage $webpage, $mode = 'normal')
 {
     if (!$webpage->needsCrawl()) {
         return false;
     }
     // Record a crash status up front, so a crawl that dies mid-run is remembered
     $webpage->last_status_code = Webpage::STATUS_CODE_CRASH;
     $webpage->save();
     $url = $webpage->url;
     if ($mode == 'debug') {
         // Debug mode: skip the actual crawl
         return true;
     }
     $crawler = new \Arachnid\Crawler($url, 1);
     $crawler->traverse();
     $html = $crawler->getHtml();
     $binary = $crawler->getBinary();
     $tmpLinks = $crawler->getInternalLinks();
     $links = $this->filterLinks($tmpLinks, $webpage);
     $statusCode = $crawler->getStatusCode();
     $snapshot = Snapshot::create([
         'html' => $html,
         'binary' => $binary,
         'webpage_id' => $webpage->id,
         'status_code' => $statusCode,
     ]);
     $snapshot->processChange();
     foreach ($links as $link) {
         Webpage::firstOrCreate(['url' => $link, 'site_id' => $webpage->site_id]);
     }
     $webpage->crawlcount++;
     $webpage->last_status_code = $statusCode;
     $webpage->save();
     return true;
 }
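The filterLinks() helper called above is not shown in the example. A hypothetical sketch of what it might do, assuming the crawler returns URL strings and only same-host links should be queued:

// Hypothetical helper: keep only links on the same host as the webpage.
// The real implementation in the example's project is not shown.
private function filterLinks(array $links, Webpage $webpage)
{
    $host = parse_url($webpage->url, PHP_URL_HOST);

    return array_values(array_filter($links, function ($link) use ($host) {
        return parse_url($link, PHP_URL_HOST) === $host;
    }));
}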