/**
 * Downloads the page at the given URL and hands its contents to the crawler.
 *
 * The script is terminated when the downloader returns a falsy result.
 *
 * @param mixed $url Value passed to Downloader::get() and echoed in the log line.
 */
public static function exec($url)
{
    Log::log("Starting download of the url <{$url}>");

    // Fetch the page; any falsy result counts as "nothing downloaded".
    $page = Downloader::get($url);
    if (!$page) {
        Log::log("No source found. Exiting now.");
        exit;
    }

    Crawler::crawl((string) $page);
}
/**
 * Downloads one archive page and saves every article it links to.
 *
 * The archive URL is read from self::$archives[$arc_key]['href'] and fetched
 * repeatedly until at least one link matches the article selector. For every
 * link whose index is not below Config::$status_post, the article page is
 * downloaded, its summary paragraph and categories are extracted, and a Post
 * is saved. self::$count is incremented per saved post; the script exits once
 * it reaches Config::$post_end (unless that setting is false).
 *
 * NOTE(review): both retry loops are unbounded, so a page that never yields
 * the expected markup keeps this method running indefinitely.
 *
 * @param mixed $arc_key Key of the archive entry inside self::$archives.
 */
public static function get_archive($arc_key)
{
    $verbose = Config::$env == "dev";
    $archive = self::$archives[$arc_key];

    Log::log("<blue>Downloading archive #" . $arc_key . " - " . $archive['text'] . " - " . $archive['href']);

    // Keep fetching the archive page until it yields at least one article link.
    do {
        $source = Downloader::get($archive['href']);
        $links = new DomCrawler((string) $source);
        $links = $links->filter('.base_pagina_articoli > div > a');
        $count = count($links);
        Log::log("<blue>Download ended.");
        if ($count == 0) {
            Log::log("<red>Found 0 articles. Why? Retrying to download.");
        }
        sleep(1);
    } while ($count == 0);

    if ($verbose) {
        Log::log("<green>The download was successfull.");
        Log::log("<cyan>Found " . $count . " posts in the archive.");
    }

    foreach ($links as $art_key => $domElement) {
        // Entries below Config::$status_post are skipped (presumably a resume
        // offset - confirm against the caller).
        if ($art_key < Config::$status_post) {
            continue;
        }
        if ($verbose) {
            Log::log("<cyan>Article ID #" . $art_key . " of archive #" . $arc_key . ".");
        }
        // Reset the offset so every following entry is processed.
        Config::$status_post = 0;

        $href = $domElement->getAttribute('href');

        // The link text carries the title on its first line and the date on the second.
        $text = explode("\n", trim($domElement->nodeValue));
        // NOTE(review): utf8_decode() is deprecated as of PHP 8.2.
        $title = \utf8_decode(htmlentities(trim($text[0])));

        $dateParts = [];
        preg_match(
            "#(\\d+)\\/(\\d+)\\/(\\d+)\\s\\-\\s(\\d+):(\\d+)#is",
            isset($text[1]) ? $text[1] : '',
            $dateParts
        );
        // Missing date parts become null instead of triggering undefined-index
        // warnings (presumably Carbon substitutes current values - confirm).
        $dateParts += array_fill(0, 6, null);
        $date = Carbon::create($dateParts[3], $dateParts[2], $dateParts[1], $dateParts[4], $dateParts[5]);

        if ($verbose) {
            Log::log("<blue>[" . $date . "] - " . $title . " - " . $href);
        }

        // The slug depends only on the URL, so it is extracted once, outside the retry loop.
        $name = '';
        if (preg_match("#articolo\\/(.+?)\\/\\d+#is", $href, $match)) {
            $name = trim($match[1]);
        }

        $article = null;
        $categories = [];
        do {
            $source = Downloader::get($href);
            $crawl = new DomCrawler((string) $source);
            if ($verbose) {
                Log::log("<darkGreen>" . $name);
            }

            // The node list and the extracted text live in separate variables so
            // that a failed extraction cannot be mistaken for a valid article.
            $article = null;
            $paragraphs = $crawl->filter('.dettagli_articoli_riassunto > p');
            if (count($paragraphs) == 0) {
                Log::log("<red>Cannot get the article #" . $art_key . ". \nRetrying.");
            } else {
                // Only the first matching paragraph is used.
                foreach ($paragraphs as $paragraph) {
                    $article = \utf8_decode(htmlentities(trim($paragraph->nodeValue)));
                    if ($verbose) {
                        Log::log("<darkGreen>" . $article);
                    }
                    break;
                }
            }

            // Collect every category link whose text is not blank.
            $categories = [];
            foreach ($crawl->filter('.dettagli_articoli_famiglie > a') as $cat) {
                $label = $cat->nodeValue;
                if (trim($label) !== '') {
                    $categories[] = $label;
                }
            }
            Log::log("<darkGreen>Categories: " . (empty($categories) ? "none" : implode(", ", $categories)));

            $valid = $article !== null && $article !== '';
            if (!$valid) {
                // Pause between attempts, as the archive loop does.
                sleep(1);
            }
        } while (!$valid);

        $post = new Post($date, $article, $title, $name, empty($categories) ? false : $categories);
        $post->save();
        self::$count++;

        if ($verbose) {
            Log::log("<red>Count: " . self::$count);
        }
        if (Config::$post_end !== false && self::$count >= Config::$post_end) {
            Log::log("<red>Ended.");
            exit;
        }
    }
}