/**
 * Crawl the domain given via the --domain option (3 link-levels deep)
 * and render a table of how many URLs were collected per HTTP status code.
 *
 * Note: the original version also accumulated the full link info of every
 * non-200 URL into a local $results array that was never read afterwards;
 * that dead work has been removed.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $domain = $input->getOption('domain');
    $crawler = new \Arachnid\Crawler($domain, 3);

    $output->writeln(sprintf('<info>Crawling %s...</info>', $domain));
    $crawler->traverse();

    $links = $crawler->getLinks();
    $output->writeln(sprintf('Collected <comment>%s</comment> URLs', count($links)));

    $table = $this->getHelper('table');
    $table->setHeaders(['Status', 'No. of URLs']);

    // status code => number of URLs seen with that code
    $counts = [];
    foreach ($links as $url => $info) {
        // Some entries (e.g. unvisited/unreachable links) carry no status code.
        if (!array_key_exists('status_code', $info)) {
            continue;
        }

        $status = $info['status_code'];
        if (!isset($counts[$status])) {
            $counts[$status] = 0;
        }
        $counts[$status]++;
    }

    $rows = [];
    foreach ($counts as $status => $count) {
        $rows[] = [$status, $count];
    }
    $table->setRows($rows);
    $table->render($output);
}
/**
 * Generate a sitemap.xml for the site given as the base_url argument.
 *
 * Validates the URL and the change-frequency option, crawls the site to
 * the requested depth, builds the sitemap DOM via getSitemapDocument(),
 * and writes it to --sitemap_path (default: <kernel root>/../web/sitemap.xml).
 *
 * Fixes over the original:
 *  - `catch (Exception $ex)` lacked a leading backslash; in a namespaced
 *    class that resolves to a namespace-local Exception and never matches.
 *  - DOMDocument::save() does not throw on failure — it returns false —
 *    so the failed-write case is now detected via the return value.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $depth = $input->getOption('links_depth');
    $frequency = $input->getOption('frequency');
    $base_url = $input->getArgument('base_url');

    if (filter_var($base_url, FILTER_VALIDATE_URL) === false) {
        $output->writeln('<error>invalid base url, please make sure its full url; ex. http://example.com</error>');
        return;
    }

    // Allowed <changefreq> values per the sitemaps.org protocol.
    $valid_frequency_arr = array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never');
    if (in_array($frequency, $valid_frequency_arr, true) === false) {
        $output->writeln("<error>invalid frequency provided, allowed values: " . implode(',', $valid_frequency_arr) . "</error>");
        return;
    }

    $output->writeln("<info>begin crawling site</info>");
    $output->writeln('<comment>begin crawling ' . $base_url . '</comment>');

    $crawler = new \Arachnid\Crawler($base_url, $depth);
    $crawler->traverse();
    $links = $crawler->getLinks();
    $output->writeln("<info>" . count($links) . " links found in the url</info>");

    $output->writeln("<comment>start generating sitemap file</comment>");
    $dom_doc = $this->getSitemapDocument($output, $links, $frequency);
    $output->writeln("<comment>finished generating sitemap file</comment>");
    $output->writeln("");

    $sitemap_path = $input->getOption('sitemap_path');
    if (empty($sitemap_path)) {
        $sitemap_path = $this->getContainer()->get('kernel')->getRootDir() . '/../web/sitemap.xml';
    }

    try {
        // DOMDocument::save() returns false (and raises a warning) on
        // failure instead of throwing, so check the result explicitly.
        if ($dom_doc->save($sitemap_path) === false) {
            $output->writeln("<error>Error: could not write sitemap file to " . $sitemap_path . "</error>");
            return;
        }
        $output->writeln("<info>sitemap file written to " . $sitemap_path . "</info>");
    } catch (\Exception $ex) {
        $output->writeln("<error>Error: " . $ex->getMessage() . " on line " . $ex->getLine() . "</error>");
    }
}
/**
 * Crawl $url (2 link-levels deep) and return the absolute URLs of all
 * internal (non-external) pages that were discovered.
 *
 * Fixes over the original:
 *  - require_once instead of require, so repeated calls do not re-execute
 *    the Composer bootstrap (matches the convention used elsewhere here).
 *  - The 'external_link' key is read with empty(), avoiding an undefined-
 *    index notice for link entries that do not carry that key.
 *
 * @param string $url base URL to crawl
 * @return string[] list of absolute internal URLs
 */
private function findPages($url)
{
    require_once 'vendor/autoload.php';

    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, 2);
    $crawler->traverse();

    // Get link data
    $links = $crawler->getLinks();

    $pages = [];
    foreach ($links as $link) {
        // Keep only links that resolved to an absolute URL and are not
        // flagged as external (missing flag is treated as internal).
        if (isset($link['absolute_url']) && empty($link['external_link'])) {
            $pages[] = $link['absolute_url'];
        }
    }

    return $pages;
}
/**
 * Crawl $url to $depth link-levels and return every (link text, URL) pair
 * whose link text contains a job title listed in job_titles.csv
 * (first CSV column, whole-word, case-insensitive match).
 *
 * Fixes over the original:
 *  - The CSV handle was never rewound, so only the first link text per
 *    link was actually matched (later texts read from an exhausted file).
 *    The titles are now loaded once into memory before the loops.
 *  - $o_job_title was never reset between texts, so a stale match was
 *    re-appended for non-matching texts; the match is now scoped per text.
 *  - $output is initialized, so an empty result returns [] instead of
 *    an undefined variable (null).
 *  - CSV content is passed through preg_quote() so regex metacharacters
 *    in a title cannot break (or inject into) the pattern.
 *
 * @param string $url   base URL to crawl
 * @param int    $depth crawl depth (default 1)
 * @return array list of [matched link text, link URL] pairs
 */
function crawl_page($url, $depth = 1)
{
    require_once 'vendor/autoload.php';

    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, $depth);
    $crawler->traverse();

    // Get link data
    $links = $crawler->getLinks();

    // Load the job-title list once, up front.
    $job_titles = array();
    $job_title_file = fopen("job_titles.csv", 'r');
    if ($job_title_file !== false) {
        while (($row = fgetcsv($job_title_file)) !== false) {
            if (isset($row[0]) && $row[0] !== '') {
                $job_titles[] = $row[0];
            }
        }
        fclose($job_title_file);
    }

    $output = array();
    foreach ($links as $lnk => $link) {
        if (empty($link['links_text'])) {
            continue;
        }
        foreach ($link['links_text'] as $tx) {
            foreach ($job_titles as $title) {
                // Whole-word, case-insensitive match; preg_quote keeps
                // CSV content from acting as regex metacharacters.
                if (preg_match("/\\b" . preg_quote($title, '/') . "\\b/i", $tx)) {
                    $output[] = array($tx, $lnk);
                    break; // first matching title wins for this text
                }
            }
        }
    }

    return $output;
}
<?php
// Standalone crawl script: bootstrap Composer's autoloader
// (script lives one directory below vendor/).
require '../vendor/autoload.php';

// Initiate crawl
// Crawl the target site up to 3 link-levels deep.
$crawler = new \Arachnid\Crawler('http://www.compagnieboisexotiques.com', 3);
$crawler->traverse();

// Get link data
// NOTE(review): presumably a map of URL => crawl metadata; $links appears
// unused in the visible chunk — the file may continue beyond this view.
$links = $crawler->getLinks();