예제 #1
0
 /**
  * Application entry point.
  *
  * Opens the log file for this run, connects to the database, runs the
  * crawler and finally closes the log file. The log file name is the
  * current time formatted with "F-Y-g-i-A" plus a ".log" suffix.
  *
  * @return void
  */
 public static function main()
 {
     Log::open_file(Carbon::now()->format("F-Y-g-i-A") . ".log");
     try {
         Database::connect();
         Crawler::crawl();
     } finally {
         // Close the log file even when connecting or crawling throws, so the
         // handle is not left open. A hard `exit` inside the called code still
         // terminates the script without running this block.
         Log::close_file();
     }
 }
예제 #2
0
파일: Filter.php 프로젝트: Wufe/Scraper
 /**
  * Groups the given nodes by the pattern each registered identifier extracts
  * from them and tells whether at least one group is big enough to be
  * considered eligible.
  *
  * For every entry of self::$identifiers the identifier callback
  * ("class::method") is invoked on each node. Nodes yielding a truthy pattern
  * are then checked against every entry of self::$filters whose 'for' equals
  * the identifier name or '*'; a single falsy filter result rejects the node.
  * Accepted nodes are collected per pattern, and a pattern with more than
  * three nodes counts as eligible.
  *
  * @param mixed $root    Unused by this method.
  * @param mixed $parent  Unused by this method.
  * @param mixed $nodes   Countable, iterable collection of nodes to group.
  * @param bool  $verbose When true, every eligible group is logged.
  *
  * @return bool True when at least one eligible group was found.
  */
 public static function filter($root, $parent, $nodes, $verbose = true)
 {
     $groups = [];
     $candidates = [];
     $total_nodes = count($nodes);
     $rejected = 0;

     foreach (self::$identifiers as $name => $id) {
         foreach ($nodes as $node_key => $node) {
             $pattern = call_user_func($id['class'] . '::' . $id['method'], $node);
             if (!$pattern) {
                 continue;
             }

             // Every applicable filter is always invoked; one failure is enough to reject.
             $accepted = true;
             foreach (self::$filters as $filter) {
                 if ($filter['for'] != $id['name'] && $filter['for'] != '*') {
                     continue;
                 }
                 if (!call_user_func($filter['class'] . "::" . $filter['method'], $node)) {
                     $accepted = false;
                 }
             }
             if (!$accepted) {
                 $rejected++;
                 continue;
             }

             // Look for a group already holding this pattern (the last match wins).
             $match = false;
             foreach ($groups as $group_key => $group) {
                 if ($group['pattern'] == $pattern) {
                     $match = $group_key;
                 }
             }
             if ($match === false) {
                 $groups[] = ['pattern' => $pattern, 'id_by' => $id['method'], "id" => ['name' => $id['name'], 'value' => $id['prefix'] . $pattern . $id['postfix']], 'nodes' => [$node]];
                 continue;
             }
             $groups[$match]['nodes'][] = $node;
         }
     }

     foreach ($groups as $group) {
         $members = $group['nodes'];
         $size = count($members);
         $label = $group['id'];
         // The threshold of three is hard-coded (marked "needs to be changed" by the author).
         if ($size <= 3) {
             continue;
         }
         if ($verbose) {
             Log::log("\tThere are " . $size . " nodes with " . strtoupper($label['name']) . " pattern <" . $label['value'] . "> eligible.");
         }
         $candidates[] = ['count' => $size, 'pattern' => $group['pattern'], 'id_by' => $group['id_by'], 'nodes' => $members];
     }

     // The rejected/total figures are intentionally not logged: the rejected
     // counter is incremented once per identifier, so the ratio would be wrong.

     return count($candidates) > 0;
 }
예제 #3
0
파일: Crawler.php 프로젝트: Wufe/Scraper
 /**
  * Runs the whole analysis pipeline on a downloaded page.
  *
  * The source is wrapped in a DomCrawler, parsed into a node tree, augmented,
  * marked with parent paths, dumped to the file "a-tree" and finally handed
  * to the scanner. Each step is announced in the log.
  *
  * @param string $source Markup of the page to analyse.
  *
  * @return void
  */
 public static function crawl($source)
 {
     $dom = new DomCrawler($source);

     Log::log("Generating tree..");
     $parsed = Parser::parse($dom, 0, "", false);

     Log::log("Calculating components..");
     // Only the first root node of the parsed tree is processed further.
     $with_components = Parser::augment($parsed[0]);

     Log::log("Calculating path..");
     $with_paths = Parser::identify_parent_path($with_components);

     // Human-readable dump of the final tree, for inspection.
     file_put_contents("a-tree", print_r($with_paths, true));

     Log::log("Scanning the tree for a pattern..");
     Scanner::scan($with_paths);

     // Manual check of the get_node_from_path and get_parent_from_path helpers:
     //Log::log( Tree::get_node_from_path( $with_paths, "0,0" )[ 'tag' ] );
     //Log::log( Tree::get_parent_from_path( $with_paths, "0,0" )[ 'tag' ] );
 }
예제 #4
0
 /**
  * Opens the MySQL connection described by the Config class and stores it
  * in self::$conn.
  *
  * On a connection error the error number and message are logged and the
  * script terminates. Progress messages are logged only when Config::$env
  * is "dev".
  *
  * @return void
  */
 public static function connect()
 {
     $is_dev = Config::$env == "dev";
     if ($is_dev) {
         Log::log("<blue>Connecting to MySQL " . Config::$db_user . "@" . Config::$db_host . ".. ");
     }

     // Warnings raised by the constructor are suppressed; failure is detected through connect_errno.
     $link = @new \mysqli(Config::$db_host, Config::$db_user, Config::$db_pass, Config::$db_name);
     if ($link->connect_errno) {
         Log::log("<red>[" . $link->connect_errno . "] " . $link->connect_error);
         exit;
     }

     self::$conn = $link;
     if ($is_dev) {
         Log::log("<green>Connected.");
     }
 }
예제 #5
0
파일: Main.php 프로젝트: Wufe/Scraper
 /**
  * Downloads the page at $url and passes its markup to the crawler.
  *
  * When the downloader yields a falsy result the failure is logged and the
  * script terminates.
  *
  * @param string $url Address of the page to download.
  *
  * @return void
  */
 public static function exec($url)
 {
     Log::log("Starting download of the url <" . $url . ">");

     $body = Downloader::get($url);
     if (!$body) {
         Log::log("No source found. Exiting now.");
         exit;
     }

     // The crawler expects a string, so the downloaded body is cast explicitly.
     Crawler::crawl((string) $body);
 }
예제 #6
0
파일: Scanner.php 프로젝트: Wufe/Scraper
 /**
  * Scans the given tree for patterns and writes a readable dump of the
  * result to the file "priority".
  *
  * @param mixed $node Root of the tree handed to Scanner::scan_for_pattern().
  *
  * @return void
  */
 public static function scan($node)
 {
     // The tweaker has to run BEFORE the scan.
     Tweaker::apply();
     $matches = Scanner::scan_for_pattern($node);
     Log::log("Tweaking results..");

     // Strip the bulky 'children' and 'node' entries so the dump stays readable.
     $readable = [];
     foreach ($matches as $entry) {
         unset($entry['children'], $entry['node']);
         $readable[] = $entry;
     }

     Log::log("Printing results.");
     file_put_contents("priority", print_r($readable, true));
 }
예제 #7
0
파일: Downloader.php 프로젝트: Wufe/Scraper
 /**
  * Downloads $url with a GET request, retrying until a 200 response arrives
  * or ten attempts have been made.
  *
  * Each attempt uses a 10 second timeout. 'http_errors' is disabled, so
  * non-2xx statuses do not throw and are inspected through the status code
  * instead. Connection and request exceptions are caught and remembered; the
  * last one is logged when no response was obtained at all.
  *
  * @param string $url Address to download.
  *
  * @return mixed The body of the last response received (even when its
  *               status is not 200), or false when every attempt failed
  *               with an exception.
  */
 public static function get($url)
 {
     $client = new Guzzle();
     $res = null;
     $tries = 0;
     $error = null;
     do {
         try {
             $res = $client->request('GET', $url, ['timeout' => 10, 'http_errors' => false]);
         } catch (\GuzzleHttp\Exception\ConnectException $e) {
             $error = "ConnectException: " . $e->getMessage();
         } catch (\GuzzleHttp\Exception\RequestException $e) {
             $error = "RequestException: " . $e->getMessage();
         }
         $tries++;
         // Keep trying while there is no response or the response is NOT a 200.
         // (The previous `== 200` re-downloaded successful responses up to ten
         // times and accepted failed ones immediately.)
     } while ((!$res || $res->getStatusCode() != 200) && $tries < 10);
     if ($res === null) {
         Log::log($error);
         return false;
     }
     return $res->getBody();
 }
예제 #8
0
파일: Tweaker.php 프로젝트: Wufe/Scraper
 /**
  * Removes blacklisted nodes and blacklisted children from a priority list.
  *
  * A node whose tag appears in self::$blacklist['parent'] is removed. For the
  * remaining nodes, when self::$blacklist['child'] is not empty, every child
  * whose tag appears in that list is removed; a node left without children
  * is removed as well. Removed entries leave gaps: neither the list nor the
  * children arrays are re-indexed.
  *
  * Differences from the previous implementation:
  *  - the list length is captured once, so unsetting entries no longer makes
  *    the loop stop before the last nodes are visited;
  *  - all blacklisted child tags are applied together instead of each tag
  *    restarting from the unfiltered children (which kept only the last
  *    tag's result and could re-create an already removed node);
  *  - nodes without a 'tag' or 'children' entry no longer raise errors.
  *
  * @param array $priority Integer-indexed list of node arrays, each expected
  *                        to carry 'tag' and 'children' entries.
  *
  * @return array The filtered list, with the original keys preserved.
  */
 public static function deletion($priority)
 {
     $parent_blacklist = isset(self::$blacklist['parent']) ? self::$blacklist['parent'] : [];
     $child_blacklist = isset(self::$blacklist['child']) ? self::$blacklist['child'] : [];

     // Captured up front: count($priority) shrinks whenever an entry is unset.
     $total = count($priority);
     for ($i = 0; $i < $total; $i++) {
         if (!array_key_exists($i, $priority)) {
             continue;
         }
         $node = $priority[$i];
         $tag = isset($node['tag']) ? $node['tag'] : null;

         // Loose comparison on purpose, matching the `==` used so far.
         if (in_array($tag, $parent_blacklist)) {
             unset($priority[$i]);
             continue;
         }

         // Without a child blacklist the nodes are left untouched.
         if (count($child_blacklist) == 0) {
             continue;
         }

         $old_children = isset($node['children']) ? $node['children'] : [];
         $new_children = [];
         foreach ($old_children as $child_key => $node_child) {
             $child_tag = isset($node_child['tag']) ? $node_child['tag'] : null;
             if ($child_tag && in_array($child_tag, $child_blacklist)) {
                 continue;
             }
             $new_children[$child_key] = $node_child;
         }

         // NOTE(review): the logged index is $i - 1, as before; it looks like
         // an off-by-one, confirm what the log readers expect.
         if (count($new_children) == 0) {
             Log::log("\tCompletely unset [" . $tag . "][" . ($i - 1) . "]");
             unset($priority[$i]);
             continue;
         }

         $removed = count($old_children) - count($new_children);
         if ($removed > 0) {
             Log::log("\tUnset " . $removed . " children from [" . $tag . "][" . ($i - 1) . "]");
         }
         $priority[$i]['children'] = $new_children;
     }
     return $priority;
 }
예제 #9
0
 /**
  * Stores this category as a new term plus its 'category' taxonomy row.
  *
  * The slug is the lower-cased result of App::clean() on the category name.
  * When either INSERT fails, the error and the query are logged and the
  * script terminates. Progress messages are logged only when Config::$env
  * is "dev".
  *
  * @return array Two elements: the new term ID and the new taxonomy ID.
  */
 public function save_id()
 {
     $is_dev = Config::$env == "dev";
     if ($is_dev) {
         Log::log("<blue>Creating the category `" . $this->name . "`..");
     }

     // First row: the term itself.
     $slug = Database::escape(strtolower(App::clean($this->name)));
     $term_query = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "terms`( `name`, `slug`, `term_group` ) VALUE( '" . Database::escape($this->name) . "', '" . $slug . "', 0 )";
     if (Database::exec($term_query) === false) {
         Log::log("<red>Cannot save the category:");
         Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
         Log::log("<red>" . $term_query);
         exit;
     }
     $term_id = Database::$conn->insert_id;
     if ($is_dev) {
         Log::log("<green>Created the category `" . $this->name . "` with the slug `" . $slug . "` and ID '" . $term_id . "'.");
     }

     // Second row: the taxonomy entry that marks the term as a category.
     $taxonomy_query = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_taxonomy`( `term_id`, `taxonomy`, `description`, `parent`, `count`) VALUE( '" . $term_id . "', 'category', '', 0, 0 );";
     if (Database::exec($taxonomy_query) === false) {
         Log::log("<red>Cannot save the taxonomy:");
         Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
         Log::log("<red>" . $taxonomy_query);
         exit;
     }
     if ($is_dev) {
         Log::log("<green>Taxonomy created.");
     }
     $tax_id = Database::$conn->insert_id;

     return [$term_id, $tax_id];
 }
예제 #10
0
 /**
  * Downloads the archive page #$arc_key, then downloads, parses and saves
  * every article linked from it.
  *
  * The archive page is re-downloaded (one second apart) until it yields at
  * least one article link. Links whose position is below
  * Config::$status_post are skipped; the offset is reset to 0 as soon as the
  * first article is processed. For each article the title and date come from
  * the link text, while the body (first summary paragraph) and the
  * categories come from the article page, which is re-downloaded until a
  * non-empty body is extracted. Both retry loops are unbounded.
  *
  * The script terminates once self::$count reaches Config::$post_end (when
  * that setting is not false).
  *
  * @param mixed $arc_key Key of the archive inside self::$archives.
  *
  * @return void
  */
 public static function get_archive($arc_key)
 {
     $verbose = Config::$env == "dev" ? true : false;
     Log::log("<blue>Downloading archive #" . $arc_key . " - " . self::$archives[$arc_key]['text'] . " - " . self::$archives[$arc_key]['href']);
     $valid = false;
     do {
         $source = Downloader::get(self::$archives[$arc_key]['href']);
         $crawler = new DomCrawler((string) $source);
         $crawler = $crawler->filter('.base_pagina_articoli > div > a');
         $count = count($crawler);
         Log::log("<blue>Download ended.");
         if ($count == 0) {
             Log::log("<red>Found 0 articles. Why? Retrying to download.");
         } else {
             $valid = true;
         }
         sleep(1);
     } while (!$valid);
     if ($verbose) {
         Log::log("<green>The download was successfull.");
         Log::log("<cyan>Found " . $count . " posts in the archive.");
     }
     foreach ($crawler as $art_key => $domElement) {
         if ($art_key >= Config::$status_post) {
             if ($verbose) {
                 Log::log("<cyan>Article ID #" . $art_key . " of archive #" . $arc_key . ".");
             }
             // The resume offset only applies to the first archive processed.
             Config::$status_post = 0;
             $href = $domElement->getAttribute('href');
             // The link text holds the title on its first line and the date on the second.
             $text = trim($domElement->nodeValue);
             $text = explode("\n", $text);
             $title = \utf8_decode(htmlentities(trim($text[0])));
             $date = [0, 0, 0, 0, 0, 0];
             // Captures, in order: day, month, year, hour, minute.
             preg_match("#(\\d+)\\/(\\d+)\\/(\\d+)\\s\\-\\s(\\d+):(\\d+)#is", $text[1], $date);
             $date = Carbon::create($date[3], $date[2], $date[1], $date[4], $date[5]);
             if ($verbose) {
                 Log::log("<blue>[" . $date . "] - " . $title . " - " . $href);
             }
             $valid = false;
             $article = null;
             $categories = [];
             do {
                 $source = Downloader::get($href);
                 $crawl = new DomCrawler((string) $source);
                 preg_match("#articolo\\/(.+?)\\/\\d+#is", $href, $match);
                 $name = trim($match[1]);
                 if ($verbose) {
                     Log::log("<darkGreen>" . $name);
                 }
                 // Reset on every attempt and keep the matched paragraphs in
                 // their own variable: an empty result set is still an object,
                 // and objects are always truthy, so reusing $article for it
                 // made the check below pass and skipped the retry.
                 $article = null;
                 $paragraphs = $crawl->filter('.dettagli_articoli_riassunto > p');
                 if (count($paragraphs) == 0) {
                     Log::log("<red>Cannot get the article #" . $art_key . ". Retrying.");
                 } else {
                     // Only the first paragraph is used as the article body.
                     foreach ($paragraphs as $paragraph) {
                         $article = \utf8_decode(htmlentities(trim($paragraph->nodeValue)));
                         if ($verbose) {
                             Log::log("<darkGreen>" . $article);
                         }
                         break;
                     }
                 }
                 $cats = $crawl->filter('.dettagli_articoli_famiglie > a');
                 $categories = [];
                 foreach ($cats as $cat) {
                     $cat = $cat->nodeValue;
                     if (trim($cat)) {
                         $categories[] = $cat;
                     }
                 }
                 // "none" is printed when the list is empty (the branches were swapped before).
                 Log::log("<darkGreen>Categories: " . ($categories ? implode(", ", $categories) : "none"));
                 if ($article) {
                     $valid = true;
                 }
             } while (!$valid);
             $post = new Post($date, $article, $title, $name, $categories ? $categories : false);
             $post->save();
             self::$count++;
             if ($verbose) {
                 Log::log("<red>Count: " . self::$count);
             }
             if (Config::$post_end !== false && self::$count >= Config::$post_end) {
                 Log::log("<red>Ended.");
                 exit;
             }
         }
     }
 }
예제 #11
0
 /**
  * Recursively renders an array or object as an indented "[key] => value"
  * listing.
  *
  * Two output modes are selected by $exp:
  *  - when $exp is a string, every line is appended to it (separated by
  *    PHP_EOL) and the accumulated string is returned;
  *  - otherwise every line is sent to Log::log() and $exp is returned as is.
  * Since the default value of $exp is the empty string, the accumulate mode
  * is the one used when $exp is omitted. Passing true as $depth or as $exp
  * also selects the accumulate mode, starting from an empty string.
  *
  * Values are rendered as: the class name for objects, "Array" / "Array()"
  * for arrays, "bool(true)" / "bool(false)" for booleans and "string(...)"
  * for strings. Other scalar types and null produce only the key prefix.
  *
  * @param mixed $obj   Array or object to iterate over.
  * @param mixed $depth Current nesting level (three spaces of indentation per
  *                     level), or true as a shortcut for accumulate mode.
  * @param mixed $exp   Accumulator string, or a non-string to log instead.
  *
  * @return mixed The accumulated text in accumulate mode, otherwise $exp.
  */
 public static function dump($obj, $depth = 0, $exp = "")
 {
     // Shortcut call form dump($obj, true): the flag moves to $exp and the depth restarts at 0.
     if ($depth === true) {
         $exp = $depth;
         $depth = 0;
     }
     // A boolean true means "accumulate": start from an empty string.
     if (is_bool($exp) && $exp === true) {
         $exp = "";
     }
     /*if( !$exp )
     		usleep( 10000 );*/
     foreach ($obj as $key => $value) {
         $string = str_repeat(" ", $depth * 3);
         // When the container is an object, its class name is shown next to the key.
         if (is_object($obj)) {
             $string .= "[" . $key . ":" . get_class($obj) . "] => ";
         } else {
             $string .= "[" . $key . "] => ";
         }
         if (is_object($value)) {
             $string .= get_class($value);
             // Color tags are only added when logging.
             if (!is_string($exp)) {
                 $string = "<darkYellow>" . $string;
             }
             if (!is_string($exp)) {
                 Log::log($string);
             } else {
                 $exp .= PHP_EOL . $string;
             }
             // Descend into the object; the accumulator is threaded through the recursion.
             $exp = self::dump($value, $depth + 1, $exp);
         } else {
             if (is_array($value)) {
                 $string .= "Array";
                 // NOTE(review): the color checks below test `!$exp` (falsy) rather than
                 // `!is_string($exp)`, so a color tag is also added in accumulate mode
                 // while the accumulator is still empty — confirm whether that is intended.
                 if (@(!$value)) {
                     $string .= "()";
                     if (!$exp) {
                         $string = "<red>" . $string;
                     }
                 } else {
                     if (!$exp) {
                         $string = "<blue>" . $string;
                     }
                 }
                 if (!is_string($exp)) {
                     Log::log($string);
                 } else {
                     $exp .= PHP_EOL . $string;
                 }
                 $exp = self::dump($value, $depth + 1, $exp);
             } else {
                 if (is_bool($value)) {
                     if ($value) {
                         if (!is_string($exp)) {
                             $string .= "<darkGreen>bool(true)";
                         } else {
                             $string .= "bool(true)";
                         }
                     } else {
                         if (!is_string($exp)) {
                             $string .= "<darkGreen>bool(false)";
                         } else {
                             $string .= "bool(false)";
                         }
                     }
                 } else {
                     // Only strings get a value rendering here; ints, floats and null fall through
                     // and leave the line with just its key prefix.
                     if (is_string($value)) {
                         $string .= "string(" . $value . ")";
                     }
                 }
                 if (!is_string($exp)) {
                     Log::log($string);
                 } else {
                     $exp .= PHP_EOL . $string;
                 }
             }
         }
     }
     return $exp;
 }
예제 #12
0
<?php

// Bootstrap script: raises the execution time limit, loads the Composer
// autoloader and starts the main controller.
use Scraper\Middleware\Log;
use Scraper\Controllers\Main;
// Allow a very long run (the value is expressed in seconds).
ini_set("max_execution_time", "900000");
// The autoloader must be loaded before Log and Main are first used below.
require 'vendor/autoload.php';
Log::log("<red>Starting main controller.");
Main::main();
예제 #13
0
 /**
  * Stores this post and links it to each of its categories.
  *
  * The post is inserted with the query returned by get_query() and the
  * generated ID is kept in $this->post_id. For every category,
  * check_exists() is called on it, a term_relationships row is inserted and
  * the 'count' column of the category's taxonomy row is incremented.
  *
  * When any query fails, the database error and the offending query are
  * logged and the script terminates.
  *
  * @return void
  */
 public function save()
 {
     $verbose = Config::$env == "dev";
     $post_save_query = $this->get_query();
     $post_saved = Database::exec($post_save_query);
     if ($post_saved === false) {
         Log::log("<red>Cannot save the article:");
         Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
         Log::log("<red>" . $post_save_query);
         exit;
     }
     $this->post_id = Database::$conn->insert_id;
     if ($verbose) {
         Log::log("<green>Article with ID [" . $this->post_id . "] saved.");
     }
     foreach ($this->category as $cat_key => $cat) {
         // Called before $cat->tax_id is read below.
         // NOTE(review): presumably this creates the category when missing and fills tax_id — confirm.
         $cat->check_exists();
         $relation_query = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_relationships` ( `object_id`, `term_taxonomy_id`, `term_order` ) VALUE ( " . $this->post_id . ", " . $cat->tax_id . ", 0 )";
         $relation_exec = Database::exec($relation_query);
         if ($relation_exec === false) {
             Log::log("<red>Cannot save the link between the article and the taxonomy:");
             Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
             Log::log("<red>" . $relation_query);
             exit;
         }
         $count_query = "UPDATE `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_taxonomy` SET count = count +1 WHERE `term_taxonomy_id` = " . $cat->tax_id . ";";
         $count_exec = Database::exec($count_query);
         if ($count_exec === false) {
             Log::log("<red>Cannot add the article to the counter:");
             // errno is a property of the connection, not a method: calling it
             // as errno() raised a fatal error instead of logging the failure.
             Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
             Log::log("<red>" . $count_query);
             exit;
         }
     }
     Log::log("<green>Article #" . $this->post_id . " saved with its categories.");
 }
예제 #14
0
파일: Parser.php 프로젝트: Wufe/Scraper
 /**
  * Recursively converts a collection of DOM nodes into a nested array tree.
  *
  * Every node becomes an array with the keys 'name', 'value' (always ""),
  * 'parent' (always "") and 'node' (the DOM node itself), plus, when
  * available: 'tag' (the tag name), 'children' (the parsed child nodes) and
  * 'xPath' (an absolute path built from the tag names, with a
  * "[position()=N]" predicate when Parser::tags_count() reports more than
  * one node with the same tag in the collection).
  *
  * Nodes without a tag name (for example text nodes) get neither 'tag' nor
  * 'xPath' and are not counted when numbering their siblings. The previous
  * `isset($tagName)` check was always true, which gave such nodes a
  * malformed path ending in "/".
  *
  * @param mixed  $node    Countable, iterable collection of DOM nodes.
  * @param int    $deep    Current depth, used for the verbose tree drawing.
  * @param string $xPath   Path of the parent node ("" at the root).
  * @param bool   $verbose When true, one tree-style line per node is logged.
  *
  * @return array|false The list of parsed nodes, or false when the
  *                     collection is empty.
  */
 public static function parse($node, $deep = 0, $xPath = "", $verbose = false)
 {
     if (count($node) < 1) {
         return false;
     }
     $ret_obj = [];
     // Occurrences seen so far for each tag name among these siblings.
     $elements_count = [];
     foreach ($node as $domElement) {
         $nodeName = $domElement->nodeName;
         $tagName = isset($domElement->tagName) ? $domElement->tagName : false;
         // Generation of an absolute xPath (only for nodes that have a tag name).
         $myPath = "";
         if ($tagName !== false) {
             if (!isset($elements_count[$tagName])) {
                 $elements_count[$tagName] = 1;
             } else {
                 $elements_count[$tagName]++;
             }
             $myPath = $xPath . ($xPath == "" ? "descendant-or-self::" : "/") . $tagName . (Parser::tags_count($node, $tagName) > 1 ? "[position()=" . $elements_count[$tagName] . "]" : "");
         }
         // The tree-style log line is only built when it is going to be printed.
         if ($verbose) {
             $log_string = "[" . $deep . "]" . (strlen($deep . "") < 2 ? " " : "");
             if ($deep == 0) {
                 $log_string .= $nodeName;
             } else {
                 if ($deep >= 1) {
                     // One "|" for the first level plus one "    |" for each further level.
                     $log_string .= "|" . str_repeat(str_repeat(" ", 4) . "|", (int) $deep - 1);
                 }
                 $log_string .= str_repeat("-", 4) . $nodeName;
             }
             if (get_class($domElement) == "DOMElement" && $domElement->getAttribute('id')) {
                 $log_string .= "[#" . $domElement->getAttribute('id') . "]";
             }
             Log::log($log_string);
         }
         $node_obj = ['name' => $nodeName, 'value' => "", 'parent' => "", 'node' => $domElement];
         if ($tagName !== false) {
             $node_obj['tag'] = $tagName;
         }
         $children = Parser::parse($domElement->childNodes, $deep + 1, $myPath, $verbose);
         if ($children) {
             $node_obj['children'] = $children;
         }
         if ($myPath != "") {
             $node_obj['xPath'] = $myPath;
         }
         $ret_obj[] = $node_obj;
     }
     return $ret_obj;
 }