/**
 * Runs one complete scraping session.
 *
 * Opens a log file named after the current date and time, calls
 * Database::connect() and Crawler::crawl(), and closes the log file again.
 */
public static function main()
{
    // Format "F-Y-g-i-A": full month name, 4-digit year, 12-hour clock hour
    // without leading zero, minutes and AM/PM.
    Log::open_file(Carbon::now()->format("F-Y-g-i-A") . ".log");

    try {
        Database::connect();
        Crawler::crawl();
    } finally {
        // Close the log even when one of the steps above throws. A step that
        // ends the process through exit does not come back here.
        Log::close_file();
    }
}
/**
 * Groups nodes by the pattern each configured identifier extracts from them
 * and tells whether at least one group is big enough to be eligible.
 *
 * Every entry of self::$identifiers is applied to every node. A node that
 * yields a truthy pattern is then checked against all entries of
 * self::$filters whose 'for' matches the identifier name (or is '*'); it is
 * kept only when none of those filters returns a falsy value.
 *
 * @param mixed $root    Not used by this method.
 * @param mixed $parent  Not used by this method.
 * @param mixed $nodes   Collection of nodes to classify (iterated with foreach).
 * @param bool  $verbose When true, every eligible group is written to the log.
 *
 * @return bool True when at least one pattern is shared by more than 3 nodes.
 */
public static function filter($root, $parent, $nodes, $verbose = true)
{
    $groups = [];

    foreach (self::$identifiers as $identifier) {
        foreach ($nodes as $candidate) {
            $pattern = call_user_func($identifier['class'] . '::' . $identifier['method'], $candidate);
            if (!$pattern) {
                continue;
            }

            // Every applicable filter is invoked, even after one has already rejected the node.
            $accepted = true;
            foreach (self::$filters as $filter) {
                if ($filter['for'] == $identifier['name'] || $filter['for'] == '*') {
                    if (!call_user_func($filter['class'] . "::" . $filter['method'], $candidate)) {
                        $accepted = false;
                    }
                }
            }
            if (!$accepted) {
                continue;
            }

            // Look for a group that already holds this pattern; the last match wins.
            $slot = false;
            foreach ($groups as $index => $group) {
                if ($group['pattern'] == $pattern) {
                    $slot = $index;
                }
            }

            if ($slot === false) {
                $groups[] = [
                    'pattern' => $pattern,
                    'id_by' => $identifier['method'],
                    "id" => [
                        'name' => $identifier['name'],
                        'value' => $identifier['prefix'] . $pattern . $identifier['postfix'],
                    ],
                    'nodes' => [$candidate],
                ];
            } else {
                $groups[$slot]['nodes'][] = $candidate;
            }
        }
    }

    $eligible = [];
    foreach ($groups as $group) {
        $size = count($group['nodes']);

        // Hard-coded threshold: a pattern needs more than 3 nodes to count.
        if ($size <= 3) {
            continue;
        }

        if ($verbose) {
            Log::log("\tThere are " . $size . " nodes with " . strtoupper($group['id']['name']) . " pattern <" . $group['id']['value'] . "> eligible.");
        }

        $eligible[] = [
            'count' => $size,
            'pattern' => $group['pattern'],
            'id_by' => $group['id_by'],
            'nodes' => $group['nodes'],
        ];
    }

    return count($eligible) > 0;
}
/**
 * Builds the node tree of an HTML source and hands it to the scanner.
 *
 * The source is parsed into a tree, augmented, marked with parent paths,
 * dumped to the file "a-tree" and finally passed to Scanner::scan().
 *
 * @param string $source Markup to analyse.
 *
 * @return void
 */
public static function crawl($source)
{
    $crawler = new DomCrawler($source);

    Log::log("Generating tree..");
    $node_tree = Parser::parse($crawler, 0, "", false);

    // Parser::parse() can return false when there is nothing to parse;
    // indexing that result below would operate on a boolean.
    if (!$node_tree) {
        Log::log("No nodes could be parsed from the source. Nothing to scan.");
        return;
    }

    Log::log("Calculating components..");
    $augmented_tree = Parser::augment($node_tree[0]);

    Log::log("Calculating path..");
    $marked_tree = Parser::identify_parent_path($augmented_tree);

    // Human-readable dump of the final tree.
    file_put_contents("a-tree", print_r($marked_tree, true));

    Log::log("Scanning the tree for a pattern..");
    Scanner::scan($marked_tree);

    // Manual check of the get_node_from_path and get_parent_from_path functions:
    //Log::log( Tree::get_node_from_path( $marked_tree, "0,0" )[ 'tag' ] );
    //Log::log( Tree::get_parent_from_path( $marked_tree, "0,0" )[ 'tag' ] );
}
/**
 * Opens the MySQL connection described by Config and stores it in self::$conn.
 *
 * On a connection error the error number and message are logged and the
 * process exits. Progress messages are logged only when Config::$env is "dev".
 *
 * @return void
 */
public static function connect()
{
    $is_dev = Config::$env == "dev";

    if ($is_dev) {
        Log::log("<blue>Connecting to MySQL " . Config::$db_user . "@" . Config::$db_host . ".. ");
    }

    // Warnings raised by the constructor are silenced; the failure is
    // detected through connect_errno right below.
    $link = @new \mysqli(Config::$db_host, Config::$db_user, Config::$db_pass, Config::$db_name);

    if ($link->connect_errno) {
        Log::log("<red>[" . $link->connect_errno . "] " . $link->connect_error);
        exit;
    }

    self::$conn = $link;

    if ($is_dev) {
        Log::log("<green>Connected.");
    }
}
/**
 * Downloads the given URL and passes its content to the crawler.
 *
 * When the downloader returns a falsy value, a message is logged and the
 * process exits.
 *
 * @param string $url Address of the page to download.
 *
 * @return void
 */
public static function exec($url)
{
    Log::log("Starting download of the url <" . $url . ">");

    $page = Downloader::get($url);

    if (!$page) {
        Log::log("No source found. Exiting now.");
        exit;
    }

    // The crawler is given the download result cast to a string.
    Crawler::crawl((string) $page);
}
/**
 * Scans a tree for a pattern and writes a readable summary to the file "priority".
 *
 * @param mixed $node Tree handed to Scanner::scan_for_pattern().
 *
 * @return void
 */
public static function scan($node)
{
    // The tweaker has to be applied BEFORE the scan.
    Tweaker::apply();

    $results = Scanner::scan_for_pattern($node);

    Log::log("Tweaking results..");

    // Strip the 'children' and 'node' entries from every result so that the
    // printed output stays readable.
    $readable = [];
    foreach ($results as $entry) {
        unset($entry['children'], $entry['node']);
        $readable[] = $entry;
    }

    Log::log("Printing results.");
    file_put_contents("priority", print_r($readable, true));
}
/**
 * Downloads a URL with a GET request, retrying until a 200 response arrives.
 *
 * A request is repeated when it raised a connection/request exception or
 * when the response status is not 200, up to $max_tries attempts. HTTP error
 * statuses do not raise exceptions ('http_errors' is disabled), so the last
 * response received is returned even when its status is not 200.
 *
 * @param string $url       Address to download.
 * @param int    $max_tries Maximum number of attempts (at least 1).
 *
 * @return mixed The value of getBody() of the last response received, or
 *               false when no attempt produced a response.
 */
public static function get($url, $max_tries = 10)
{
    $max_tries = max(1, (int) $max_tries);

    $client = new Guzzle();
    $res = null;
    $error = null;

    for ($tries = 0; $tries < $max_tries; $tries++) {
        try {
            $res = $client->request('GET', $url, ['timeout' => 10, 'http_errors' => false]);
        } catch (\GuzzleHttp\Exception\ConnectException $e) {
            $error = "ConnectException: " . $e->getMessage();
        } catch (\GuzzleHttp\Exception\RequestException $e) {
            $error = "RequestException: " . $e->getMessage();
        }

        // Stop as soon as a successful response is in hand; anything else is retried.
        if ($res !== null && $res->getStatusCode() == 200) {
            break;
        }
    }

    if ($res === null) {
        // Every attempt failed with an exception: report the last one.
        Log::log($error);

        return false;
    }

    return $res->getBody();
}
/**
 * Removes blacklisted nodes and blacklisted children from a priority list.
 *
 * - A node whose 'tag' is listed in self::$blacklist['parent'] is removed.
 * - Otherwise, when self::$blacklist['child'] is not empty, every child whose
 *   'tag' is listed there is dropped; a node left without children is removed.
 *
 * Keys of the surviving nodes and children are preserved (no re-indexing).
 *
 * @param array $priority List of nodes, each an array that may carry 'tag' and 'children'.
 *
 * @return array The filtered list.
 */
public static function deletion($priority)
{
    $banned_parents = self::$blacklist['parent'];
    $banned_children = self::$blacklist['child'];

    // Walk a snapshot of the keys: entries are removed from $priority on the way.
    foreach (array_keys($priority) as $i) {
        $node = $priority[$i];
        $tag = isset($node['tag']) ? $node['tag'] : null;

        if ($tag !== null && in_array($tag, $banned_parents)) {
            unset($priority[$i]);
            continue;
        }

        // Without a child blacklist the children are left untouched.
        if (!$banned_children) {
            continue;
        }

        $children = isset($node['children']) && is_array($node['children']) ? $node['children'] : [];

        $kept = [];
        foreach ($children as $child_key => $child) {
            // Children without a (truthy) tag can never be blacklisted.
            $banned = !empty($child['tag']) && in_array($child['tag'], $banned_children);
            if (!$banned) {
                $kept[$child_key] = $child;
            }
        }

        // NOTE(review): the index printed in the log messages is $i - 1, as in
        // the previous implementation; confirm that this offset is intended.
        if (count($kept) == 0) {
            Log::log("\tCompletely unset [" . $tag . "][" . ($i - 1) . "]");
            unset($priority[$i]);
            continue;
        }

        $removed = count($children) - count($kept);
        if ($removed > 0) {
            Log::log("\tUnset " . $removed . " children from [" . $tag . "][" . ($i - 1) . "]");
        }

        $priority[$i]['children'] = $kept;
    }

    return $priority;
}
/**
 * Inserts this category as a term and as a 'category' taxonomy row.
 *
 * Two INSERT statements are executed: one into the `terms` table and one
 * into the `term_taxonomy` table (both prefixed with Config::$wp_prefix).
 * When either statement fails, the error and the query are logged and the
 * process exits. Progress messages are logged only when Config::$env is "dev".
 *
 * @return array Two elements: the inserted term ID and the inserted taxonomy ID.
 */
public function save_id()
{
    $is_dev = Config::$env == "dev";

    if ($is_dev) {
        Log::log("<blue>Creating the category `" . $this->name . "`..");
    }

    // --- term row ---
    $slug = Database::escape(strtolower(App::clean($this->name)));
    $term_sql = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "terms`( `name`, `slug`, `term_group` ) VALUE( '" . Database::escape($this->name) . "', '" . $slug . "', 0 )";

    if (Database::exec($term_sql) === false) {
        Log::log("<red>Cannot save the category:");
        Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
        Log::log("<red>" . $term_sql);
        exit;
    }

    $term_id = Database::$conn->insert_id;

    if ($is_dev) {
        Log::log("<green>Created the category `" . $this->name . "` with the slug `" . $slug . "` and ID '" . $term_id . "'.");
    }

    // --- taxonomy row pointing at the term just created ---
    $taxonomy_sql = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_taxonomy`( `term_id`, `taxonomy`, `description`, `parent`, `count`) VALUE( '" . $term_id . "', 'category', '', 0, 0 );";

    if (Database::exec($taxonomy_sql) === false) {
        Log::log("<red>Cannot save the taxonomy:");
        Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
        Log::log("<red>" . $taxonomy_sql);
        exit;
    }

    if ($is_dev) {
        Log::log("<green>Taxonomy created.");
    }

    $tax_id = Database::$conn->insert_id;

    return [$term_id, $tax_id];
}
/**
 * Downloads one archive page and saves every article it links to as a Post.
 *
 * The archive page (self::$archives[$arc_key]['href']) is downloaded again
 * and again, one second apart, until it yields at least one article link.
 * Links positioned before Config::$status_post are skipped. For each
 * remaining link the article page is downloaded (up to 10 attempts), its
 * first summary paragraph and its categories are extracted, and a Post is
 * saved. An article whose text cannot be obtained is logged and skipped.
 *
 * The process exits once self::$count reaches Config::$post_end (unless
 * Config::$post_end is false).
 *
 * @param mixed $arc_key Key of the archive inside self::$archives.
 *
 * @return void
 */
public static function get_archive($arc_key)
{
    $verbose = Config::$env == "dev";
    $max_attempts = 10;

    Log::log("<blue>Downloading archive #" . $arc_key . " - " . self::$archives[$arc_key]['text'] . " - " . self::$archives[$arc_key]['href']);

    // Fetch the archive page until at least one article link is found.
    do {
        $source = Downloader::get(self::$archives[$arc_key]['href']);
        $crawler = new DomCrawler((string) $source);
        $crawler = $crawler->filter('.base_pagina_articoli > div > a');
        $count = count($crawler);

        Log::log("<blue>Download ended.");
        if ($count == 0) {
            Log::log("<red>Found 0 articles. Why? Retrying to download.");
        }

        sleep(1);
    } while ($count == 0);

    if ($verbose) {
        Log::log("<green>The download was successfull.");
        Log::log("<cyan>Found " . $count . " posts in the archive.");
    }

    foreach ($crawler as $art_key => $domElement) {
        // Skip the links positioned before the configured starting point.
        if ($art_key < Config::$status_post) {
            continue;
        }

        if ($verbose) {
            Log::log("<cyan>Article ID #" . $art_key . " of archive #" . $arc_key . ".");
        }

        // From now on no link is skipped any more.
        Config::$status_post = 0;

        $href = $domElement->getAttribute('href');

        // The link text holds the title on its first line and the date on the second.
        $text = explode("\n", trim($domElement->nodeValue));
        $title = \utf8_decode(htmlentities(trim($text[0])));

        // Captures, in order: day, month, year, hour, minute — presumably a
        // "dd/mm/yyyy - hh:mm" date; confirm against the scraped pages.
        preg_match("#(\\d+)\\/(\\d+)\\/(\\d+)\\s\\-\\s(\\d+):(\\d+)#is", $text[1], $date_parts);
        $date = Carbon::create($date_parts[3], $date_parts[2], $date_parts[1], $date_parts[4], $date_parts[5]);

        if ($verbose) {
            Log::log("<blue>[" . $date . "] - " . $title . " - " . $href);
        }

        // The post name is the URL segment between "articolo/" and the numeric part.
        preg_match("#articolo\\/(.+?)\\/\\d+#is", $href, $match);
        $name = trim($match[1]);
        if ($verbose) {
            Log::log("<darkGreen>" . $name);
        }

        $content = null;
        $categories = [];

        for ($attempt = 0; $attempt < $max_attempts && !$content; $attempt++) {
            $source = Downloader::get($href);
            $crawl = new DomCrawler((string) $source);

            $paragraphs = $crawl->filter('.dettagli_articoli_riassunto > p');
            if (count($paragraphs) == 0) {
                Log::log("<red>Cannot get the article #" . $art_key . ". \nRetrying.");
            } else {
                // Only the first matching paragraph is used as article text.
                foreach ($paragraphs as $paragraph) {
                    $content = \utf8_decode(htmlentities(trim($paragraph->nodeValue)));
                    if ($verbose) {
                        Log::log("<darkGreen>" . $content);
                    }
                    break;
                }
            }

            $categories = [];
            foreach ($crawl->filter('.dettagli_articoli_famiglie > a') as $cat) {
                $label = $cat->nodeValue;
                if (trim($label)) {
                    $categories[] = $label;
                }
            }

            Log::log("<darkGreen>Categories: " . ($categories ? implode(", ", $categories) : "none"));
        }

        if (!$content) {
            Log::log("<red>Giving up on the article #" . $art_key . " after " . $max_attempts . " attempts.");
            continue;
        }

        // A post without categories receives false instead of an empty list.
        $post = new Post($date, $content, $title, $name, $categories ? $categories : false);
        $post->save();

        self::$count++;
        if ($verbose) {
            Log::log("<red>Count: " . self::$count);
        }

        if (Config::$post_end !== false && self::$count >= Config::$post_end) {
            Log::log("<red>Ended.");
            exit;
        }
    }
}
/**
 * Recursively dumps an array or object, either to the log or into a string.
 *
 * Output mode is selected by $exp:
 * - a string (the default ""), or true: every line is appended to the string,
 *   preceded by PHP_EOL, and the resulting string is returned;
 * - any other value (e.g. false): every line is sent to Log::log() with
 *   colour tags, and $exp is returned unchanged.
 *
 * Calling dump($obj, true) is a shortcut for string output at depth 0.
 *
 * @param mixed $obj   Array or object to iterate over.
 * @param mixed $depth Nesting level (3 spaces of indentation each), or true (see above).
 * @param mixed $exp   String accumulated so far, or the mode selector described above.
 *
 * @return mixed The accumulated string in string mode, otherwise $exp as received.
 */
public static function dump($obj, $depth = 0, $exp = "")
{
    if ($depth === true) {
        $exp = $depth;
        $depth = 0;
    }
    if ($exp === true) {
        $exp = "";
    }

    // Colour tags belong to log output only; they must never end up in the exported string.
    $to_log = !is_string($exp);

    foreach ($obj as $key => $value) {
        $line = str_repeat(" ", $depth * 3);
        $line .= is_object($obj)
            ? "[" . $key . ":" . get_class($obj) . "] => "
            : "[" . $key . "] => ";

        $nested = false;

        if (is_object($value)) {
            $nested = true;
            $line .= get_class($value);
            if ($to_log) {
                $line = "<darkYellow>" . $line;
            }
        } elseif (is_array($value)) {
            $nested = true;
            $line .= "Array";
            if (!$value) {
                $line .= "()";
                if ($to_log) {
                    $line = "<red>" . $line;
                }
            } elseif ($to_log) {
                $line = "<blue>" . $line;
            }
        } elseif (is_bool($value)) {
            $line .= ($to_log ? "<darkGreen>" : "") . ($value ? "bool(true)" : "bool(false)");
        } elseif (is_string($value)) {
            $line .= "string(" . $value . ")";
        } elseif (is_int($value)) {
            $line .= "int(" . $value . ")";
        } elseif (is_float($value)) {
            $line .= "float(" . $value . ")";
        } elseif ($value === null) {
            $line .= "NULL";
        }

        if ($to_log) {
            Log::log($line);
        } else {
            $exp .= PHP_EOL . $line;
        }

        if ($nested) {
            $exp = self::dump($value, $depth + 1, $exp);
        }
    }

    return $exp;
}
<?php

// Bootstrap script: raises the execution time limit, loads the Composer
// autoloader and starts the main controller.

use Scraper\Controllers\Main;
use Scraper\Middleware\Log;

ini_set("max_execution_time", "900000");

require 'vendor/autoload.php';

Log::log("<red>Starting main controller.");

Main::main();
/**
 * Saves the post and links it to each of its categories.
 *
 * The post is inserted with the query returned by get_query(); the inserted
 * ID is stored in $this->post_id. For every category a row is added to the
 * `term_relationships` table and the `count` column of its `term_taxonomy`
 * row is incremented. When any query fails, the error and the query are
 * logged and the process exits.
 *
 * @return void
 */
public function save()
{
    $verbose = Config::$env == "dev";

    $post_save_query = $this->get_query();
    if (Database::exec($post_save_query) === false) {
        Log::log("<red>Cannot save the article:");
        Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
        Log::log("<red>" . $post_save_query);
        exit;
    }

    $this->post_id = Database::$conn->insert_id;
    if ($verbose) {
        Log::log("<green>Article with ID [" . $this->post_id . "] saved.");
    }

    foreach ($this->category as $cat) {
        // Called before $cat->tax_id is read — presumably it resolves or
        // creates the category; confirm against check_exists().
        $cat->check_exists();

        $relation_query = "INSERT INTO `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_relationships` ( `object_id`, `term_taxonomy_id`, `term_order` ) VALUE ( " . $this->post_id . ", " . $cat->tax_id . ", 0 )";
        if (Database::exec($relation_query) === false) {
            Log::log("<red>Cannot save the link between the article and the taxonomy:");
            Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
            Log::log("<red>" . $relation_query);
            exit;
        }

        $count_query = "UPDATE `" . Config::$db_name . "`.`" . Config::$wp_prefix . "term_taxonomy` SET count = count +1 WHERE `term_taxonomy_id` = " . $cat->tax_id . ";";
        if (Database::exec($count_query) === false) {
            Log::log("<red>Cannot add the article to the counter:");
            // errno is a property of the connection, not a method.
            Log::log("<red>[" . Database::$conn->errno . "] " . Database::$conn->error);
            Log::log("<red>" . $count_query);
            exit;
        }
    }

    Log::log("<green>Article #" . $this->post_id . " saved with its categories.");
}
/**
 * Recursively converts a collection of DOM nodes into a nested array tree.
 *
 * Every node becomes an array with the keys:
 * - 'name'     the node name;
 * - 'value'    always an empty string here;
 * - 'parent'   always an empty string here;
 * - 'node'     the DOM node itself;
 * - 'tag'      the tag name (only for nodes that have one);
 * - 'children' the parsed child nodes (only when there are any);
 * - 'xPath'    an absolute path built from the tag names (only for nodes
 *              that have a tag name).
 *
 * @param mixed  $node    Countable, iterable collection of DOM nodes.
 * @param int    $deep    Current depth, used for the log output.
 * @param string $xPath   Path of the parent node ("" at the root).
 * @param bool   $verbose When true, one line per node is written to the log.
 *
 * @return array|false List of node arrays, or false when $node is empty.
 */
public static function parse($node, $deep = 0, $xPath = "", $verbose = false)
{
    if (count($node) == 0) {
        return false;
    }

    $ret_obj = [];

    // How many siblings of each tag have been seen so far.
    $elements_count = [];

    foreach ($node as $domElement) {
        $nodeName = $domElement->nodeName;
        $tagName = isset($domElement->tagName) ? $domElement->tagName : false;

        // Absolute xPath, only for nodes that carry a tag name.
        $myPath = "";
        if ($tagName !== false) {
            $elements_count[$tagName] = isset($elements_count[$tagName]) ? $elements_count[$tagName] + 1 : 1;

            $myPath = $xPath . ($xPath == "" ? "descendant-or-self::" : "/") . $tagName;

            // A position predicate is added only when the tag occurs more than once among the siblings.
            if (Parser::tags_count($node, $tagName) > 1) {
                $myPath .= "[position()=" . $elements_count[$tagName] . "]";
            }
        }

        if ($verbose) {
            // One log line per node, drawn as a tree: "[depth]" followed by
            // one "|" column per level and the node name.
            $log_string = "[" . $deep . "]" . (strlen($deep . "") < 2 ? " " : "");
            if ($deep == 0) {
                $log_string .= $nodeName;
            } else {
                if ($deep >= 1) {
                    $log_string .= "|" . str_repeat(str_repeat(" ", 4) . "|", $deep - 1);
                }
                $log_string .= str_repeat("-", 4) . $nodeName;
            }

            if ($domElement instanceof \DOMElement && $domElement->getAttribute('id')) {
                $log_string .= "[#" . $domElement->getAttribute('id') . "]";
            }

            Log::log($log_string);
        }

        $node_obj = ['name' => $nodeName, 'value' => "", 'parent' => "", 'node' => $domElement];

        if ($tagName !== false) {
            $node_obj['tag'] = $tagName;
        }

        $children = Parser::parse($domElement->childNodes, $deep + 1, $myPath, $verbose);
        if ($children) {
            $node_obj['children'] = $children;
        }

        if ($myPath != "") {
            $node_obj['xPath'] = $myPath;
        }

        $ret_obj[] = $node_obj;
    }

    return $ret_obj;
}