/**
 * Store fetched data on disk and hand the stored file to the parser.
 *
 * The target location is derived from the URL. For example
 * "http://host/a/b.html" is written to <save_path>host/a/b.html, and a URL
 * without any path component is written to <save_path>host/index.
 * Only "http://" URLs are recognised; anything else is rejected.
 *
 * @access public
 * @param string $url  URL the data was fetched from
 * @param mixed  $data payload to store; when it is an object, its
 *                     `results` property is stored instead of the object
 * @return bool|null false when an argument is empty, the URL does not match
 *                   the expected pattern, or the file could not be written;
 *                   no value (null) on success
 **/
public function save($url, $data)
{
    if (!$url || !$data) {
        return false;
    }

    $url = trim($url, '/');
    $first = stripos($url, '/');
    $end = strripos($url, '/');
    $tmp_url = $url;
    $last = 'index';

    // When the only slashes are the two of the "//" after the scheme, the URL
    // has no path component and the default file name 'index' is kept.
    if (!($first == $end || $first == $end - 1)) {
        $last = strrchr($url, '/');
        // A dot in the last segment means it is treated as a file name, so it
        // is cut off before the directory part is extracted below.
        if ($last && strpos($last, '.')) {
            $tmp_url = substr($url, 0, $end);
        }
    }

    // $match[1] is the host, $match[2] the leading run of the path made of
    // letters, digits and slashes (it stops at the first other character).
    preg_match('/http:\\/\\/([^\\/]+)[\\/]?([a-zA-Z0-9\\/]*)/i', $tmp_url, $match);
    if (!$match) {
        return false;
    }

    $store = Loader::load_config('store');
    $sub = isset($match[2]) && $match[2] ? $match[2] : '';
    $sub = $sub ? trim($sub, '/') . '/' : '';
    $path = $store['save_path'] . trim($match[1], '/') . '/' . $sub;
    $file = $path . trim($last, '/');
    $content = is_object($data) ? $data->results : $data;

    // Presumably creates the directory when it is missing - confirm against
    // the check_path() helper.
    check_path($path, 0777);

    // Do not notify the parser about a file that was never written.
    if (@file_put_contents($file, serialize($content)) === false) {
        return false;
    }

    // Must be called, otherwise the parser is never triggered.
    push_to_parser($url, $file);
}
/**
 * Run the user-defined parse hook when one is configured.
 *
 * Reads the 'hooks' config. If it has a non-empty 'parse' entry, the class
 * file 'parse.<lower-cased class name>' is loaded, the class is instantiated
 * and the configured method is called with ($this, $path, $ext).
 *
 * @param mixed $path forwarded unchanged to the hook method
 * @param mixed $ext  forwarded unchanged to the hook method
 * @return void
 **/
public function custom_fetch($path, $ext)
{
    $hooks = Loader::load_config('hooks', false);

    // empty() instead of a direct read: a hooks config without a 'parse'
    // entry must not raise an undefined-index notice.
    if (!$hooks || empty($hooks['parse'])) {
        return;
    }

    $hook = $hooks['parse'];
    Loader::load('parse.' . strtolower($hook['class']));

    $class = $hook['class'];
    $obj = new $class();
    $args = array($this, $path, $ext);
    call_user_func_array(array($obj, $hook['method']), $args);
}
/**
 * Send a single command to the SOAP endpoint described by the
 * soap_host / soap_port / soap_user / soap_pass config values.
 *
 * @param mixed $command value passed as the "command" SOAP parameter
 * @return bool true when the call completed, false when any Exception
 *              was thrown while connecting or executing
 */
public static function soap_connect($command)
{
    try {
        $host = Loader::load_config("soap_host");
        $port = Loader::load_config("soap_port");
        $user = Loader::load_config("soap_user");
        $pass = Loader::load_config("soap_pass");

        // Non-WSDL mode (first argument NULL), so location and uri are given explicitly.
        $options = array(
            "location" => "http://" . $host . ":" . $port,
            "uri"      => "urn:TC",
            "style"    => SOAP_RPC,
            "login"    => $user,
            "password" => $pass,
        );

        $client = new SoapClient(NULL, $options);
        $param = new SoapParam($command, "command");
        $client->executeCommand($param);
    } catch (Exception $e) {
        return false;
    }

    return true;
}
/**
 * Open a PDO MySQL connection to the given database and keep it in
 * $this->db_conn.
 *
 * Host, port and credentials come from the db_host / db_port / db_user /
 * db_pass config values. The connection is switched to exception error mode.
 *
 * On a PDOException the message is echoed and the method still returns
 * $this->db_conn, i.e. whatever that property held before the failed attempt.
 *
 * @param string $db_name database name placed in the DSN
 * @return mixed the current value of $this->db_conn
 */
public function connect($db_name)
{
    try {
        $host = Loader::load_config("db_host");
        $port = Loader::load_config("db_port");
        $dsn = "mysql:host=" . $host . ";port=" . $port . ";dbname=" . $db_name;

        $user = Loader::load_config("db_user");
        $pass = Loader::load_config("db_pass");

        $this->db_conn = new PDO($dsn, $user, $pass);
        $this->db_conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    } catch (PDOException $e) {
        echo "DB ERROR: " . $e->getMessage();
    }

    return $this->db_conn;
}
/**
 * Run the user-defined crawl hook when one is configured.
 *
 * Reads the 'hooks' config. If it contains a 'crawl' entry, the class file
 * 'crawl.<lower-cased class name>' is loaded, the class is instantiated and
 * the configured method is called with ($this, $url, $ext).
 *
 * @param mixed $url forwarded unchanged to the hook method
 * @param mixed $ext forwarded unchanged to the hook method
 * @return bool true when a hook was invoked, false when none is configured
 **/
public function custom_fetch($url, $ext)
{
    $hooks = Loader::load_config('hooks', false);

    // Nothing configured: let the caller know no custom logic ran.
    if (!$hooks || !isset($hooks['crawl'])) {
        return false;
    }

    $hook = $hooks['crawl'];
    Loader::load('crawl.' . strtolower($hook['class']));

    $class = $hook['class'];
    $handler = new $class();
    call_user_func_array(
        array($handler, $hook['method']),
        array($this, $url, $ext)
    );

    return true;
}
/**
 * Return the MySQL link, connecting on the first call.
 *
 * The link is cached in a function-static variable, so later calls return the
 * same link. The config is resolved (and checked for emptiness) before the
 * cache is consulted. Any failure terminates the script via exit/die.
 *
 * NOTE(review): the PHP manual defines the second parameter of
 * mysql_connect() as the username, yet $config['port'] is passed here - this
 * looks unintended; confirm which config keys hold the credentials.
 * NOTE(review): the mysql_* functions were removed in PHP 7.
 *
 * @param array $config connection settings (host, port, dbname, charset are
 *                      read); when empty, the 'db' config is loaded instead
 * @return resource the MySQL link
 */
public function init($config = array())
{
    if (empty($config)) {
        $config = Loader::load_config('db');
    }
    if (empty($config)) {
        exit('db config is empty');
    }

    static $link = null;
    if ($link) {
        return $link;
    }

    $link = mysql_connect($config['host'], $config['port']);
    if (!$link) {
        die('Could not connect to mysql server.');
    }

    if (!mysql_select_db($config['dbname'], $link)) {
        die('Could not select database.');
    }

    // No link argument is passed, as in the original call.
    mysql_query("SET NAMES {$config['charset']}");

    return $link;
}
<?php // +---------------------------------------------------------------------- // | ThinkCrawler Framework [ I CAN DO IT JUST THINK IT ] // +---------------------------------------------------------------------- // | Copyright (c) 2011-2015 ThinkLei Team (http://www.smartlei.com) // +---------------------------------------------------------------------- // | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 ) // +---------------------------------------------------------------------- // | Author ThinkLei <*****@*****.**> // +---------------------------------------------------------------------- //抓取server Loader::load('crawl.Crawl'); //初始化一些全局变量 global $redis, $crawler_server, $crawler_monitor; $config = Loader::load_config('server'); $crawler_server = new swoole_server($config['crawler']['host'], $config['crawler']['port']); if (!$crawler_server) { exit('crawler_server connect failed'); } $crawler_server->set($config['crawler']['options']); //绑定一些事件及相应的回调函数 $crawler_server->on('start', function (swoole_server $crawler_server) { echo 'crawler_server start_time--' . date('Y-m-d H:i:s') . "\n"; echo "master_pid:{$crawler_server->master_pid}--manager_pid:{$crawler_server->manager_pid}\n"; echo 'version--[' . SWOOLE_VERSION . "]\n"; }); $crawler_server->on('workerStart', function (swoole_server $crawler_server, $worker_id) { global $argv; if ($worker_id >= $crawler_server->setting['worker_num']) { swoole_set_process_name("php {$argv[0]} task worker");
// +---------------------------------------------------------------------- // | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 ) // +---------------------------------------------------------------------- // | Author ThinkLei <*****@*****.**> // +---------------------------------------------------------------------- /** * 调度中心 * 1、协调crawler和parser工作 * 2、由于swoole的特殊性,所有是回调函数的形式 * 3、定义了监控器 * **/ //定义一些全局变量 global $crawler_monitor, $parser_monitor, $redis, $crawler_server, $parser_server, $crawler_topic, $parser_topic, $start; //加载site配置,里面包含了要抓取的网站列表及相应的配置,请看测试实例 $site = Loader::load_config('site'); if (!$site && !isset($site['urls']) && !$site['urls']) { exit('没有定义要抓取的网站列表'); } else { $urls = is_array($site['urls']) ? $site['urls'] : array($site['urls']); } //初始化redis $redis = init_redis(); $crawl_keys = $parse_keys = array(); //把要抓取的网站添加的要监控的队列 foreach ($urls as $url) { preg_match('/http:\\/\\/[^\\/]+[\\/]?/i', $url, $match); if (!$match) { continue; } $key = md5(trim($match[0], '/'));