コード例 #1
0
/**
 * Issue a raw HTTP/1.1 GET for $gurl on port 80 and return the first response
 * header line that begins with "cookie" (case-insensitive).
 *
 * The result is memoised in the globals $gcookie / $lastRfurl: when the same
 * (trimmed) URL is requested again and a non-empty cookie line is cached, the
 * cached line is returned without touching the network.
 *
 * Terminates the script via die() when the socket cannot be opened.
 *
 * @param string $gurl URL to request; surrounding whitespace is ignored.
 * @return string The matching header line, or '' when the URL is empty or no
 *                header matched.
 */
function RefurlCookie($gurl){
	global $gcookie,$lastRfurl;
	$gurl = trim($gurl);
	if(!empty($gcookie) && $lastRfurl==$gurl) return $gcookie;
	$lastRfurl = $gurl;
	if($gurl=='') return '';

	$urlinfos = GetHostInfo($gurl);
	$ghost = $urlinfos['host'];
	$gquery = $urlinfos['query'];

	$sessionQuery = "GET $gquery HTTP/1.1\r\n";
	$sessionQuery .= "Host: $ghost\r\n";
	$sessionQuery .= "Accept: */*\r\n";
	$sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
	$sessionQuery .= "Connection: Close\r\n\r\n";

	$errno = "";
	$errstr = "";
	$m_fp = fsockopen($ghost, 80, $errno, $errstr,10) or die($ghost.'<br />');
	fwrite($m_fp,$sessionQuery);

	// Read the response headers only: stop at the blank line that ends the
	// header section, or after 100 lines as a safety cap.
	$gcookie = "";
	$lnum = 0;
	while(!feof($m_fp)){
		$line = trim(fgets($m_fp,256));
		$lnum++;
		if($line == "" || $lnum>100) break;
		// preg_match replaces eregi(), which was removed in PHP 7.0.
		// NOTE(review): servers normally send cookies as "Set-Cookie: ...",
		// which this "^cookie" pattern does not match. Kept as-is because
		// callers may rely on the current result format - confirm intent.
		if(preg_match('/^cookie/i',$line)){
			$gcookie = $line;
			break;
		}
	}
	fclose($m_fp);
	return $gcookie;
}
コード例 #2
0
ファイル: function.spider.php プロジェクト: edmundwong/V604
/**
 * Ask the cloud service for pick rules (or ready-made content) matching $url.
 *
 * Flow:
 *  - returns FALSE when cloud picking is switched off, or when a fresh
 *    server-side "miss" marker (type "<get_type>3") already exists in
 *    strayer_searchindex for this domain/path;
 *  - otherwise calls the RPC endpoint; an RPC error object carrying a Message
 *    is turned into a localized error string;
 *  - data_type 1: the reply contains rules, which are applied to $content and,
 *    when they yield data, imported locally and indexed as "<get_type>4";
 *  - data_type 2: the reply already contains the extracted data;
 *  - anything else: a "<get_type>3" miss marker is stored so the server is
 *    not queried again until the marker expires.
 *
 * @param mixed  $get_type Rule kind; the value 3 selects the "evo" code paths.
 * @param string $url      Page URL to look rules up for.
 * @param string $content  Page content the returned rules are applied to.
 * @return mixed FALSE, a localized error string, or the extracted data
 *               (empty array when nothing was obtained).
 */
function cloud_match_rules($get_type, $url, $content)
{
    global $_G;
    pload('F:fastpick');
    $setting = get_pick_set();
    $pick_config = $_G['cache']['evn_milu_pick']['pick_config'];
    $server_cache_time = $pick_config['index_server_cache_time'];
    if ($get_type == '3') {
        // The smart-learning (evo) rule index uses a shorter expiry time.
        $server_cache_time = $pick_config['evo_index_server_cache_time'];
    }
    // Return value is not used here; the call is kept in case it has side
    // effects - NOTE(review): drop it if pick_common_get() is a pure getter.
    pick_common_get();
    if ($setting['open_cloud_pick'] != 1) {
        return FALSE;
    }
    pload('F:copyright');
    $host_info = GetHostInfo($url);
    $domain = $host_info['host'];
    $domain_hash = md5($domain);

    // Hash the path with all digits stripped, so URLs that differ only by
    // numeric ids share one index entry.
    $url_temp = preg_replace('/\\d+/', '', $url);
    $arr_temp = parse_url($url_temp);
    // parse_url() may return false or omit 'path'; hash '' in that case.
    $path = (is_array($arr_temp) && isset($arr_temp['path'])) ? $arr_temp['path'] : '';
    $path_hash = md5($path);

    $over_dateline = (int) $_G['timestamp'] - (int) $server_cache_time;
    // Type suffix: 3 = server-side marker, 4 = local cache.
    // $get_type is caller-supplied, so escape it before building SQL.
    $server_type = addslashes($get_type . '3');
    $count = DB::result(DB::query("SELECT COUNT(*) FROM " . DB::table('strayer_searchindex') . " WHERE  domain_hash='" . $domain_hash . "' AND path_hash='" . $path_hash . "' AND type='" . $server_type . "' AND dateline > " . $over_dateline), 0);
    if ($count) {
        return FALSE;
    }

    $args = array('get_type' => $get_type, 'url' => $url);
    $rpcClient = rpcClient();
    $client_info = get_client_info();
    $re = $rpcClient->cloud_match_rules($args, $client_info);
    // An object reply with a Message is an RPC error; any other reply is
    // normalised to an array. (The original condition read ->Number on
    // non-objects and was effectively always true.)
    if (is_object($re) && !empty($re->Message)) {
        return milu_lang('phprpc_error', array('msg' => $re->Message));
    }
    $re = (array) $re;

    $data_type = isset($re['data_type']) ? $re['data_type'] : null;
    $data = array();
    if ($data_type == 1) {
        // The server returned rules.
        $rules_info = isset($re['data']) ? $re['data'] : null;
        if ($get_type == 3) {
            $data = evo_rules_get_article($content, $rules_info);
        } else {
            $data = rules_get_article($content, $rules_info);
        }
        // NOTE(review): the original test was
        // `$data || $data['content'] && $get_type == 3`, whose right-hand side
        // can never be true when $data is falsy, so it reduces to `$data`.
        // If evo rules were meant to require $data['content'], tighten this.
        if ($data) {
            // The rules produced data, so store them locally.
            if ($get_type == 3) {
                $data_id = import_evo_data($rules_info);
            } else {
                $data_id = import_fastpick_data($rules_info);
            }
            if ($data_id) {
                // Clear any previous index entries first, then add the new one.
                DB::query('DELETE FROM ' . DB::table('strayer_searchindex') . " WHERE domain_hash='" . $domain_hash . "' AND path_hash='" . $path_hash . "'");
                add_search_index($domain_hash, $path_hash, $get_type . '4', $data_id);
            }
        }
    } elseif ($data_type == 2) {
        // The server returned the content itself.
        $data = $re['data'];
    } else {
        // Nothing found: still record that, so this client stops asking the
        // server about the same domain/path.
        add_search_index($domain_hash, $path_hash, $get_type . '3', 0);
    }
    return $data;
}
コード例 #3
0
 /**
  * Record the title/text DOM information of the current page ($this->url) in
  * strayer_evo and, once a second sample with the same detail-ID hash exists
  * for the domain, derive extraction rules from the pair.
  *
  * Outcomes:
  *  - null  : $info is empty, title/text too short, or no text rules could be
  *            derived from the two samples;
  *  - FALSE : too many own links, the feature is disabled
  *            (fp_open_evo != 1), the first split segment is shorter than 14
  *            bytes, or no title DOM info is available;
  *  - array : 'evo_title_info' plus 'status' => 'no' (sample stored, waiting
  *            for another page) or 'ok' (rules generated and indexed).
  *
  * @param array $info Expects ['text' => ['html', 'key'], 'title' => ['html', 'key']].
  * @return array|false|null
  */
 function evo_set($info)
 {
     global $_G;
     if (!$info) {
         return;
     }
     // Title or body too short: not usable.
     if (strlen($info['text']['html']) < 200 || strlen($info['title']['html']) < 10) {
         return;
     }
     // More than 10 links pointing back at the page's own site: not usable.
     $link_count = own_link_count($info['text']['html'], $this->url);
     if ($link_count > 10) {
         return FALSE;
     }
     $milu_set = pick_common_get();
     if ($milu_set['fp_open_evo'] != 1) {
         return FALSE;
     }

     $text_info = $this->dom_info_arr[$info['text']['key']];
     $title_info = $this->dom_info_arr[$info['title']['key']];
     $text_info['html'] = $info['text']['html'];
     $title_info['html'] = $info['title']['html'];
     $info['title_split_arr'] = $this->get_split_arr($title_info);
     $info['text_split_arr'] = $split_arr = $this->get_split_arr($text_info);
     // The bulky markup is only needed for get_split_arr(); drop it before
     // the DOM info is serialized and stored.
     unset($text_info['outertext'], $text_info['parent']['outertext'], $title_info['outertext'], $title_info['parent']['outertext'], $text_info['html'], $title_info['html']);
     if (strlen($split_arr[0]) < 14) {
         return FALSE;
     }

     pload('F:copyright');
     $host_info = GetHostInfo($this->url);
     $domain = $host_info['host'];
     $domain_hash = md5($domain);

     // When the first segment contains digits, use its longest digit-free
     // piece as the detail ID.
     if (preg_match('/\\d+/', $split_arr[0])) {
         $s_arr = preg_split("/[\\d]+/", $split_arr[0]);
         $split_arr[0] = $s_arr[0];
         foreach ((array) $s_arr as $k => $v) {
             if (strlen($v) > strlen($split_arr[0])) {
                 $split_arr[0] = $v;
             }
         }
     }
     if (!$title_info) {
         return FALSE;
     }

     $result_info = array();
     $result_info['evo_title_info'] = $title_info;
     $setarr = array('content_get_type' => 0, 'detail_ID' => $split_arr[0], 'detail_ID_hash' => md5($split_arr[0]), 'detail_ID_test' => $this->url, 'content_rules' => '', 'evo_text_info' => serialize($text_info), 'evo_title_info' => serialize($title_info), 'domain_hash' => $domain_hash, 'domain' => $domain, 'status' => 0, 'dateline' => $_G['timestamp']);
     $setarr = paddslashes($setarr);

     // The URL comes from crawled pages; escape it before embedding it in the
     // hand-built SQL below.
     $url_sql = addslashes($this->url);
     $evo_table = DB::table('strayer_evo');
     $pending_where = "domain_hash='{$domain_hash}' AND detail_ID_hash='" . $setarr['detail_ID_hash'] . "' AND status=0";

     // Look for a pending sample from a *different* URL with the same ID hash.
     $data_info = DB::fetch_first("SELECT * FROM " . $evo_table . " WHERE " . $pending_where . " AND detail_ID_test!='{$url_sql}'");
     $data_info = pstripslashes($data_info);

     if (!$data_info) {
         // No other sample yet: store this one unless it is already recorded.
         $check = DB::result(DB::query("SELECT COUNT(*) FROM " . $evo_table . " WHERE " . $pending_where . " AND detail_ID_test='{$url_sql}'"), 0);
         if (!$check) {
             DB::insert('strayer_evo', $setarr, TRUE);
         }
         $result_info['status'] = 'no';
         return $result_info;
     }

     // A second sample exists: derive rules from the pair.
     $title_rules = $this->get_rules($info, $title_info, $data_info, 'title');
     $text_rules = $this->get_rules($info, $text_info, $data_info, 'text');

     // Remove an earlier finished record (and its index entry) so rules are
     // not generated repeatedly when the index is missing.
     $check_info = DB::fetch_first("SELECT * FROM " . $evo_table . " WHERE domain_hash='{$domain_hash}' AND detail_ID_hash='" . $setarr['detail_ID_hash'] . "' AND status=1");
     if ($check_info) {
         DB::query('DELETE FROM ' . $evo_table . " WHERE id='{$check_info['id']}'");
         DB::query('DELETE FROM ' . DB::table('strayer_searchindex') . " WHERE id='{$check_info['id']}' AND type='34'");
     }

     if ($text_rules) {
         $setarr = array('content_get_type' => $text_rules['get_type'], 'content_rules' => $text_rules['rules'], 'theme_get_type' => $title_rules['get_type'], 'theme_rules' => $title_rules['rules'], 'status' => 1);
         DB::update("strayer_evo", $setarr, array("id" => $data_info['id']));
         // Fixed: the hash was assigned to a misspelled variable ($pash_hash),
         // so an undefined $path_hash was passed to add_search_index().
         $path_hash = get_path_hash($this->url);
         // Add the index entry; type 34 = evo rules (3) in the local cache (4).
         add_search_index($domain_hash, $path_hash, 34, $data_info['id']);

         $pick_set = get_pick_set();
         if ($pick_set['open_cloud_pick'] == 1) {
             // Cloud picking is enabled: upload the new rules to the server.
             $rpcClient = rpcClient();
             $data_info['content_get_type'] = $setarr['content_get_type'];
             $data_info['content_rules'] = $setarr['content_rules'];
             $data_info['theme_get_type'] = $setarr['theme_get_type'];
             $data_info['theme_rules'] = $setarr['theme_rules'];
             $client_info = get_client_info();
             $rpcClient->upload_evo_data($data_info, $client_info);
         }
         del_search_index(3);
         $result_info['status'] = 'ok';
         return $result_info;
     }
     // No text rules could be derived: fall through and return null.
 }