Beispiel #1
0
 /**
  * Fetches the page behind the given URL and hands its source to the crawler.
  *
  * The start of the download is logged. When Downloader::get() yields a falsy
  * result, a message is logged and the script terminates.
  *
  * @param mixed $url Value passed to Downloader::get() and concatenated into the log line.
  */
 public static function exec($url)
 {
     Log::log("Starting download of the url <" . $url . ">");

     $page = Downloader::get($url);

     // Nothing usable came back: report it and stop the whole script.
     if (!$page) {
         Log::log("No source found. Exiting now.");
         exit;
     }

     Crawler::crawl((string) $page);
 }
Beispiel #2
0
 /**
  * Downloads one archive listing and saves a Post for every article linked from it.
  *
  * The listing at self::$archives[$arc_key]['href'] is fetched until the selector
  * '.base_pagina_articoli > div > a' matches at least one link. Every link whose
  * index is >= Config::$status_post is processed: title and date are read from the
  * link text, the article page is fetched for its summary paragraph and categories,
  * and a Post is constructed and saved. self::$count is incremented per saved post;
  * the script exits once it reaches Config::$post_end (unless that is false).
  *
  * Both download loops retry without an upper bound.
  *
  * @param int|string $arc_key Key into self::$archives.
  */
 public static function get_archive($arc_key)
 {
     $verbose = Config::$env == "dev";
     $archive = self::$archives[$arc_key];
     Log::log("<blue>Downloading archive #" . $arc_key . " - " . $archive['text'] . " - " . $archive['href']);

     // Keep fetching the listing until at least one article link is found.
     do {
         $source = Downloader::get($archive['href']);
         $listing = new DomCrawler((string) $source);
         $crawler = $listing->filter('.base_pagina_articoli > div > a');
         $count = count($crawler);
         Log::log("<blue>Download ended.");
         $valid = $count > 0;
         if (!$valid) {
             Log::log("<red>Found 0 articles. Why? Retrying to download.");
         }
         sleep(1);
     } while (!$valid);

     if ($verbose) {
         Log::log("<green>The download was successfull.");
         Log::log("<cyan>Found " . $count . " posts in the archive.");
     }

     foreach ($crawler as $art_key => $domElement) {
         // Config::$status_post acts as a start offset: entries below it are skipped.
         if ($art_key < Config::$status_post) {
             continue;
         }
         if ($verbose) {
             Log::log("<cyan>Article ID #" . $art_key . " of archive #" . $arc_key . ".");
         }
         // Once the offset has been reached it is cleared, so it is not applied again.
         Config::$status_post = 0;

         $href = $domElement->getAttribute('href');
         $text = explode("\n", trim($domElement->nodeValue));
         $title = \utf8_decode(htmlentities(trim($text[0])));

         // The second line of the link text carries a "day/month/year - hour:minute" stamp.
         $stamp = [];
         $hasStamp = isset($text[1])
             && preg_match("#(\\d+)\\/(\\d+)\\/(\\d+)\\s\\-\\s(\\d+):(\\d+)#is", $text[1], $stamp) === 1;
         if ($hasStamp) {
             $date = Carbon::create($stamp[3], $stamp[2], $stamp[1], $stamp[4], $stamp[5]);
         } else {
             // NOTE(review): without a parsable stamp the original code ended up passing
             // nulls to Carbon::create(); that is kept here. Presumably Carbon then fills
             // in current date/time components -- confirm this fallback is wanted.
             $date = Carbon::create(null, null, null, null, null);
         }
         if ($verbose) {
             Log::log("<blue>[" . $date . "] - " . $title . " - " . $href);
         }

         // The slug depends only on the link, so it is extracted once, outside the retry loop.
         $match = [];
         $name = preg_match("#articolo\\/(.+?)\\/\\d+#is", $href, $match) === 1 ? trim($match[1]) : '';

         // Fetch the article page until a non-empty summary paragraph is obtained.
         do {
             $source = Downloader::get($href);
             $crawl = new DomCrawler((string) $source);
             if ($verbose) {
                 Log::log("<darkGreen>" . $name);
             }

             // The node list gets its own variable: previously it shared $article, and
             // since an object casts to true the "Retrying" branch never actually retried
             // and the node list itself was handed to Post as the article text.
             $article = null;
             $paragraphs = $crawl->filter('.dettagli_articoli_riassunto > p');
             if (count($paragraphs) == 0) {
                 Log::log("<red>Cannot get the article #" . $art_key . ". Retrying.");
             } else {
                 // Only the first matching paragraph is used.
                 foreach ($paragraphs as $paragraph) {
                     $article = \utf8_decode(htmlentities(trim($paragraph->nodeValue)));
                     if ($verbose) {
                         Log::log("<darkGreen>" . $article);
                     }
                     break;
                 }
             }

             // Collect category labels, dropping whitespace-only entries.
             $categories = [];
             foreach ($crawl->filter('.dettagli_articoli_famiglie > a') as $cat) {
                 $label = $cat->nodeValue;
                 if (trim($label) !== '') {
                     $categories[] = $label;
                 }
             }
             // "none" is reported when the list is empty (the original condition was inverted).
             Log::log("<darkGreen>Categories: " . ($categories ? implode(", ", $categories) : "none"));

             $valid = $article !== null && $article !== '';
             if (!$valid) {
                 // Pause before retrying, as the archive loop above does.
                 sleep(1);
             }
         } while (!$valid);

         // Post receives false instead of an empty category list.
         $post = new Post($date, $article, $title, $name, $categories ? $categories : false);
         $post->save();
         self::$count++;
         if ($verbose) {
             Log::log("<red>Count: " . self::$count);
         }
         if (Config::$post_end !== false && self::$count >= Config::$post_end) {
             Log::log("<red>Ended.");
             exit;
         }
     }
 }