Exemple #1
0
 /**
  * Serialize crawled data into a file derived from the URL, then notify the parser.
  *
  * The target is <store.save_path><host>/<sub path>/<file name>. 'index' is used
  * as the file name when the URL has no '/' beyond the "//" of its scheme.
  *
  * @access public
  * @param  string        $url  Source URL; it must contain an "http://host" part.
  * @param  string|object $data Content to store, or an object whose ->results holds it.
  * @return false|null    false when an argument is empty, the URL does not match,
  *                       or the file cannot be written; no value otherwise.
  **/
 public function save($url, $data)
 {
     if (!$url || !$data) {
         return false;
     }
     $url = trim($url, '/');
     $first = stripos($url, '/');
     $end = strripos($url, '/');
     $tmp_url = $url;
     $last = 'index';
     // Only look for a trailing segment when there is a '/' after the scheme's "//".
     if (!($first == $end || $first == $end - 1)) {
         $last = strrchr($url, '/');
         // A dot in the last segment: drop that segment before extracting the sub path.
         if ($last && strpos($last, '.')) {
             $tmp_url = substr($url, 0, $end);
         }
     }
     // $match[1] is the host, $match[2] the alphanumeric sub path (may be empty).
     preg_match('/http:\\/\\/([^\\/]+)[\\/]?([a-zA-Z0-9\\/]*)/i', $tmp_url, $match);
     if (!$match) {
         return false;
     }
     $store = Loader::load_config('store');
     $sub = isset($match[2]) && $match[2] ? $match[2] : '';
     $sub = $sub ? trim($sub, '/') . '/' : '';
     $path = $store['save_path'] . trim($match[1], '/') . '/' . $sub;
     $file = $path . trim($last, '/');
     $content = is_object($data) ? $data->results : $data;
     check_path($path, 0777);
     // Do not hide a failed write: without the file the parser has nothing to read.
     if (@file_put_contents($file, serialize($content)) === false) {
         return false;
     }
     // Must be called, otherwise the parser is never triggered.
     push_to_parser($url, $file);
 }
Exemple #2
0
 /**
  * Run the user-defined parse hook, if one is configured.
  *
  * Loads the 'hooks' config; when it has a non-empty 'parse' entry, the hook
  * class is loaded and instantiated, and the configured method is called with
  * ($this, $path, $ext).
  *
  * @param mixed $path Passed through to the hook unchanged.
  * @param mixed $ext  Passed through to the hook unchanged.
  **/
 public function custom_fetch($path, $ext)
 {
     $hooks = Loader::load_config('hooks', false);
     // !empty() is isset + truthiness, so a missing 'parse' key raises no notice.
     if ($hooks && !empty($hooks['parse'])) {
         Loader::load('parse.' . strtolower($hooks['parse']['class']));
         $obj = new $hooks['parse']['class']();
         $args = array($this, $path, $ext);
         call_user_func_array(array($obj, $hooks['parse']['method']), $args);
     }
 }
Exemple #3
0
 /**
  * Send a command over SOAP (uri "urn:TC") to the endpoint described by the
  * soap_host / soap_port / soap_user / soap_pass config values.
  *
  * @param  mixed $command Wrapped in a SoapParam named "command".
  * @return bool  true once executeCommand() has returned, false if any Exception was thrown.
  */
 public static function soap_connect($command)
 {
     try {
         $location = "http://" . Loader::load_config("soap_host") . ":" . Loader::load_config("soap_port");
         $options = array(
             "location" => $location,
             "uri" => "urn:TC",
             "style" => SOAP_RPC,
             "login" => Loader::load_config("soap_user"),
             "password" => Loader::load_config("soap_pass"),
         );
         // NULL WSDL: the client works from the location/uri options above.
         $client = new SoapClient(NULL, $options);
         $client->executeCommand(new SoapParam($command, "command"));
     } catch (Exception $e) {
         return false;
     }
     return true;
 }
Exemple #4
0
 /**
  * Open a PDO MySQL connection to the given database and store it in $this->db_conn.
  *
  * Host, port, user and password come from the db_* config values. A PDOException
  * is not rethrown: its message is echoed and whatever $this->db_conn currently
  * holds is returned.
  *
  * @param  string $db_name Database name placed in the DSN.
  * @return mixed  The value of $this->db_conn after the attempt.
  */
 public function connect($db_name)
 {
     try {
         $host = Loader::load_config("db_host");
         $port = Loader::load_config("db_port");
         $dsn = "mysql:host=" . $host . ";port=" . $port . ";dbname=" . $db_name;
         $user = Loader::load_config("db_user");
         $pass = Loader::load_config("db_pass");
         $this->db_conn = new PDO($dsn, $user, $pass);
         // Make later PDO errors surface as exceptions.
         $this->db_conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
     } catch (PDOException $failure) {
         echo "DB ERROR: " . $failure->getMessage();
     }
     return $this->db_conn;
 }
Exemple #5
0
 /**
  * Run the user-defined crawl hook when one is configured.
  *
  * @param  mixed $url Passed through to the hook unchanged.
  * @param  mixed $ext Passed through to the hook unchanged.
  * @return bool  true when a 'crawl' hook entry exists and was invoked, false otherwise.
  **/
 public function custom_fetch($url, $ext)
 {
     $hooks = Loader::load_config('hooks', false);
     // Nothing configured: report that no custom logic ran.
     if (!$hooks || !isset($hooks['crawl'])) {
         return false;
     }
     $hook = $hooks['crawl'];
     Loader::load('crawl.' . strtolower($hook['class']));
     $class = $hook['class'];
     $handler = new $class();
     call_user_func_array(array($handler, $hook['method']), array($this, $url, $ext));
     return true;
 }
Exemple #6
0
 /**
  * Return the MySQL link, connecting on the first call and reusing it afterwards.
  *
  * The link is kept in a function-level static. The script terminates when the
  * config is empty, the connection fails, or the database cannot be selected.
  *
  * NOTE(review): the PHP manual lists mysql_connect()'s second parameter as the
  * username, yet $config['port'] is passed there - confirm the intended arguments.
  *
  * @param  array $config Connection settings; the 'db' config is loaded when empty.
  * @return mixed The link returned by mysql_connect().
  */
 public function init($config = array())
 {
     if (empty($config)) {
         $config = Loader::load_config('db');
     }
     if (empty($config)) {
         exit('db config is empty');
     }
     static $_db = null;
     if (!$_db) {
         $_db = mysql_connect($config['host'], $config['port']);
         if (!$_db) {
             die('Could not connect to mysql server.');
         }
         if (!mysql_select_db($config['dbname'], $_db)) {
             die('Could not select database.');
         }
         mysql_query("SET NAMES {$config['charset']}");
     }
     return $_db;
 }
<?php

// +----------------------------------------------------------------------
// | ThinkCrawler Framework [ I CAN DO IT JUST THINK IT ]
// +----------------------------------------------------------------------
// | Copyright (c) 2011-2015 ThinkLei Team (http://www.smartlei.com)
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author ThinkLei <*****@*****.**>
// +----------------------------------------------------------------------
// Crawler server: load the crawl component, create the swoole server from the
// 'server' config and register its 'start' callback.
Loader::load('crawl.Crawl');
// Initialise some global variables
global $redis, $crawler_server, $crawler_monitor;
$config = Loader::load_config('server');
$crawler_server = new swoole_server($config['crawler']['host'], $config['crawler']['port']);
if (!$crawler_server) {
    exit('crawler_server connect failed');
}
$crawler_server->set($config['crawler']['options']);
// Bind events to their callback functions
$crawler_server->on('start', function (swoole_server $crawler_server) {
    // Print the start time, the master/manager process ids and the swoole version.
    echo 'crawler_server start_time--' . date('Y-m-d H:i:s') . "\n";
    echo "master_pid:{$crawler_server->master_pid}--manager_pid:{$crawler_server->manager_pid}\n";
    echo 'version--[' . SWOOLE_VERSION . "]\n";
});
$crawler_server->on('workerStart', function (swoole_server $crawler_server, $worker_id) {
    global $argv;
    if ($worker_id >= $crawler_server->setting['worker_num']) {
        swoole_set_process_name("php {$argv[0]} task worker");
Exemple #8
0
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author ThinkLei <*****@*****.**>
// +----------------------------------------------------------------------
/**
 * 调度中心 
 * 1、协调crawler和parser工作 
 * 2、由于swoole的特殊性,所有是回调函数的形式
 * 3、定义了监控器
 *
 **/
// Declare the globals shared with the callbacks
global $crawler_monitor, $parser_monitor, $redis, $crawler_server, $parser_server, $crawler_topic, $parser_topic, $start;
// Load the site config, which holds the list of sites to crawl and their
// settings (see the test examples)
$site = Loader::load_config('site');
// Stop when the config is missing or has no usable 'urls' entry. The previous
// guard chained its three checks with &&, so a config without 'urls' slipped
// through and produced array(null).
if (empty($site['urls'])) {
    exit('没有定义要抓取的网站列表');
}
// Accept either a single URL or a list of URLs
$urls = is_array($site['urls']) ? $site['urls'] : array($site['urls']);
// Initialise redis
$redis = init_redis();
$crawl_keys = $parse_keys = array();
//把要抓取的网站添加的要监控的队列
foreach ($urls as $url) {
    preg_match('/http:\\/\\/[^\\/]+[\\/]?/i', $url, $match);
    if (!$match) {
        continue;
    }
    $key = md5(trim($match[0], '/'));