/**
 * Crawl a page and store what was found in the kuaso_links table.
 *
 * @param string $url    Address handed to the spider and stored in the row.
 * @param string $action "add" inserts a new row; "update" rewrites the row
 *                       whose url column equals $url; any other value only
 *                       crawls and stores nothing.
 * @return void
 */
function AddAndUpdateUrl($url, $action)
{
    global $db;

    $crawler = new spider();
    $crawler->url($url);

    // Column order mirrors the row layout passed to the database helper.
    $record = array(
        'url' => $url,
        'title' => $crawler->title,
        'fulltxt' => $crawler->fulltxt(800),
        'pagesize' => $crawler->pagesize,
        'keywords' => $crawler->keywords,
        'description' => $crawler->description,
        'updatetime' => time(),
    );

    // switch compares loosely, exactly like the == checks it stands in for.
    switch ($action) {
        case "add":
            $db->insert("kuaso_links", $record);
            break;
        case "update":
            $db->update("kuaso_links", $record, "url='" . $url . "'");
            break;
    }
}
/**
 * Register a URL in ve123_links, crawling it only the first time it is seen.
 *
 * If a row with this url already exists, only its updatetime is refreshed.
 * Otherwise the spider class is loaded, the page is crawled and a full row
 * is inserted.
 *
 * NOTE(review): $url is concatenated into the SQL text as-is. Confirm that
 * callers escape it or that the $db helper does — neither is visible here.
 *
 * @param string $url Address to register.
 * @return void
 */
function add_site($url)
{
    global $db;

    $existing = $db->get_one("select * from ve123_links where url='" . $url . "'");
    if (!empty($existing)) {
        // Known link: just bump the timestamp, no crawl needed.
        $db->update("ve123_links", array("updatetime" => time()), "url='" . $url . "'");
        return;
    }

    // The spider class is only needed on the insert path, so load it lazily.
    require_once PATH . "include/spider/spider_class.php";
    $spider = new spider();
    $spider->url($url);

    $title = $spider->title;
    $fulltxt = $spider->fulltxt(800);
    $keywords = $spider->keywords;
    $description = $spider->description;
    $pagesize = $spider->pagesize;

    $db->insert("ve123_links", array(
        "url" => $url,
        "title" => $title,
        "fulltxt" => $fulltxt,
        "pagesize" => $pagesize,
        "keywords" => $keywords,
        "description" => $description,
        "updatetime" => time(),
    ));
}
/**
 * Publish a batch of collected items one after another, reporting progress
 * through iPHP::dialog after each item.
 *
 * Items come from $pubArray plus any entries posted in $_POST['pub'], each of
 * which is a "cid|pid|rid|url|title" string. When nothing is left to publish
 * an alert is raised. After the loop the spider_project row identified by
 * $this->pid gets its lastupdate stamped and a final success dialog is shown.
 *
 * @param array $pubArray Pre-built items with the keys sid, url, title, cid, rid, pid.
 * @return void
 */
function do_mpublish($pubArray = array()) {
    iPHP::$break = false;

    if (!empty($_POST['pub'])) {
        foreach ((array) $_POST['pub'] as $packed) {
            // Limit to 5 parts so a "|" inside the title (the last field) is kept,
            // and pad so a short entry yields nulls instead of undefined offsets.
            list($cid, $pid, $rid, $url, $title) = array_pad(explode('|', $packed, 5), 5, null);
            $pubArray[] = array('sid' => 0, 'url' => $url, 'title' => $title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid);
        }
    }

    if (empty($pubArray)) {
        // NOTE(review): presumably $break = true makes alert() stop the request — confirm in iPHP.
        iPHP::$break = true;
        iPHP::alert('暂无最新内容', 0, 30);
    }

    $_count = count($pubArray);

    // Turn off buffering so each progress dialog reaches the browser immediately.
    ob_start();
    ob_end_flush();
    ob_implicit_flush(1);

    // Track the 1-based position separately: the array key is reset by foreach on
    // every pass and can never equal the item count for a 0-based list.
    $position = 0;
    foreach ((array) $pubArray as $a) {
        spider::$sid = $a['sid'];
        spider::$cid = $a['cid'];
        spider::$pid = $a['pid'];
        spider::$rid = $a['rid'];
        spider::$url = $a['url'];
        spider::$title = $a['title'];

        $rs = $this->multipublish();

        $position++;
        // The first item opens the dialog, later ones update it.
        $updateMsg = $position > 1;
        // Only the last item's dialog gets a timeout.
        $timeout = $position == $_count ? '3' : false;
        iPHP::dialog($rs['msg'], 'js:' . $rs['js'], $timeout, 0, $updateMsg);
        ob_flush();
        flush();
    }

    iDB::update('spider_project', array('lastupdate' => time()), array('id' => $this->pid));
    iPHP::dialog('success:#:check:#:采集完成!', 0, 3, 0, true);
}
if (empty($site)) { $array = array('url' => $url, 'spider_depth' => $config["spider_depth"], 'indexdate' => time(), 'addtime' => time()); $db->insert("kuaso_sites", $array); } $site = $db->get_one("select * from kuaso_sites where url='{$url}'"); if (!empty($site)) { $ip = ip(); //$referer=$_SERVER['HTTP_REFERER']; $v = $db->get_one("select * from kuaso_stat_visitor where v_ip='" . $ip . "' and v_time>='" . (time() - 86400 * 1) . "'"); if (empty($v)) { $array = array('v_time' => time(), 'v_ip' => $ip); $db->insert("kuaso_stat_visitor", $array); $db->query("update kuaso_sites set com_time='" . time() . "',com_count_ip=com_count_ip+1 where url='" . $url . "'"); } } $site = $db->get_one("select * from kuaso_sites where url='{$url}'"); if (!empty($site)) { $row = $db->get_one("select * from kuaso_links where url='" . $url . "'"); if (empty($row)) { $spider = new spider(); $spider->url($url); $title = $spider->title; $fulltxt = $spider->fulltxt(800); $keywords = $spider->keywords; $description = $spider->description; $pagesize = $spider->pagesize; $array = array('url' => $url, 'title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'updatetime' => time()); $db->insert("kuaso_links", $array); } } }
/**
 * Download a URL with cURL and return its body converted to spider::$charset.
 *
 * Behaviour visible in this method:
 *  - 301/302 responses are followed manually, and empty or non-200 responses
 *    are retried, both bounded by $_count < 5.
 *  - 404 and 500 responses give up immediately.
 *  - When spider::$dataTest or spider::$ruleTest is set, request info and the
 *    first 800 bytes of the body are echoed for debugging.
 *  - On success spider::$url is set to the URL that was finally fetched.
 *
 * NOTE(review): TLS peer and host verification are switched off in the
 * options below; confirm this is intended for the crawler.
 *
 * @param string $url    Address to fetch; "&amp;" sequences are decoded first.
 * @param int    $_count Redirect/retry depth, used internally by the recursion.
 * @return string|false  Converted body, or false on 404/500 or an unusable redirect.
 */
public static function remote($url, $_count = 0) {
    // Links scraped out of HTML carry entity-encoded ampersands.
    $url = str_replace('&amp;', '&', $url);

    // Parsed unconditionally: the redirect branch needs scheme and host even
    // when a referer had already been set by an earlier call.
    $uri = parse_url($url);
    $origin = (isset($uri['scheme']) ? $uri['scheme'] : '') . '://' . (isset($uri['host']) ? $uri['host'] : '');
    if (empty(spider::$referer)) {
        spider::$referer = $origin;
    }

    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => spider::$encoding,
        CURLOPT_REFERER => spider::$referer,
        CURLOPT_USERAGENT => spider::$useragent,
        CURLOPT_TIMEOUT => 10,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FAILONERROR => 1,
        CURLOPT_HEADER => 0,
        CURLOPT_NOSIGNAL => true,
        CURLOPT_DNS_USE_GLOBAL_CACHE => true,
        CURLOPT_DNS_CACHE_TIMEOUT => 86400,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );
    if (spider::$cookie) {
        $options[CURLOPT_COOKIE] = spider::$cookie;
    }
    if (spider::$curl_proxy) {
        $proxy = spiderTools::proxy_test();
        if ($proxy) {
            $options = spiderTools::proxy($options, $proxy);
        }
    }

    $ch = curl_init();
    curl_setopt_array($ch, $options);
    $responses = curl_exec($ch);
    $info = curl_getinfo($ch);
    $debug = spider::$dataTest || spider::$ruleTest;

    if ($debug) {
        echo "<b>{$url} 头信息:</b><pre>";
        print_r($info);
        echo '</pre><hr />';
        if (!empty($_GET['breakinfo'])) {
            exit;
        }
    }

    if (in_array($info['http_code'], array(301, 302)) && $_count < 5) {
        $_count++;
        $newurl = isset($info['redirect_url']) ? $info['redirect_url'] : '';
        if (empty($newurl)) {
            // cURL did not report the target; re-run with headers and read Location by hand.
            curl_setopt($ch, CURLOPT_HEADER, 1);
            $header = curl_exec($ch);
            $matches = array();
            preg_match('|Location: (.*)|i', (string) $header, $matches);
            $newurl = isset($matches[1]) ? trim($matches[1]) : '';

            $isAbsolute = preg_match('#^https?://#i', $newurl) === 1;
            if (!$isAbsolute) {
                $newurl = ltrim($newurl, '/');
            }
            if (empty($newurl)) {
                curl_close($ch);
                return false;
            }
            if (!$isAbsolute) {
                // Relative target: resolve against the host of the current request.
                $newurl = $origin . '/' . $newurl;
            }
        }
        $newurl = trim($newurl);
        curl_close($ch);
        return spiderTools::remote($newurl, $_count);
    }

    if (in_array($info['http_code'], array(404, 500))) {
        curl_close($ch);
        return false;
    }

    if ((empty($responses) || $info['http_code'] != 200) && $_count < 5) {
        $_count++;
        if ($debug) {
            echo $url . '<br />';
            echo "获取内容失败,重试第{$_count}次...<br />";
        }
        curl_close($ch);
        return spiderTools::remote($url, $_count);
    }

    // Take the charset from the Content-Type header when the server sent one;
    // otherwise hand null to charsetTrans.
    $content_charset = null;
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
    $pos = stripos($contentType, 'charset=');
    if ($pos !== false) {
        $content_charset = trim(substr($contentType, $pos + 8));
    }
    $responses = spiderTools::charsetTrans($responses, $content_charset, spider::$charset);
    curl_close($ch);

    if ($debug) {
        echo '<pre>';
        print_r(htmlspecialchars(substr($responses, 0, 800)));
        echo '</pre><hr />';
    }

    spider::$url = $url;
    return $responses;
}
/**
 * Crawl a page and queue everything it points to.
 *
 * For every site address reported by the spider: a ve123_links row is added
 * when none exists yet (tagged with the id of the matching ve123_sites row,
 * if any), progress is echoed, and a ve123_sites row is added when none
 * exists. For every link reported by the spider a ve123_links_temp row is
 * added when missing. Values failing is_url() are never inserted.
 *
 * NOTE(review): crawled values are concatenated into the SQL text as-is.
 * Confirm that the spider or the $db helper escapes them — neither is
 * visible here.
 *
 * @param string $url Page to crawl.
 * @return void
 */
function insert_links($url)
{
    global $db, $config;

    $spider = new spider();
    $spider->url($url);
    $links = $spider->links();
    $sites = $spider->sites();

    // Guard against the spider returning a non-array for an empty or failed page.
    $links = is_array($links) ? $links : array();
    $sites = is_array($sites) ? $sites : array();

    foreach ($sites as $value) {
        // Resolve the owning site from the address being processed in this pass.
        $site_url = GetSiteUrl($value);
        $site = $db->get_one("select * from ve123_sites where url='" . $site_url . "'");
        $site_id = isset($site["site_id"]) ? $site["site_id"] : null;

        $row = $db->get_one("select * from ve123_links where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            echo $value . "<br>";
            $db->insert("ve123_links", array('url' => $value, 'site_id' => $site_id, 'level' => '0'));
        } else {
            echo "已存在:" . $value . "<br>";
        }
        // Push the progress line to the browser right away.
        ob_flush();
        flush();

        $row = $db->get_one("select * from ve123_sites where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            $db->insert("ve123_sites", array('url' => $value, 'spider_depth' => $config["spider_depth"], 'addtime' => time()));
        }
    }

    foreach ($links as $value) {
        $row = $db->get_one("select * from ve123_links_temp where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            $db->insert("ve123_links_temp", array('url' => $value));
        }
    }
}
/**
 * Re-crawl a link and, if it passes the owning site's keyword filters, store
 * the fresh content in ve123_links.
 *
 * The row's updatetime is always stamped. Content is only written when the
 * page has a title without "?" and a pagesize above 1, and then:
 *  - with no matching ve123_sites row the page is accepted;
 *  - a non-empty include_word list requires at least one listed word in the HTML;
 *  - a non-empty not_include_word list rejects the page if any listed word occurs.
 *
 * The crawled title is echoed, and the script exits before returning when the
 * global $bug_url is empty.
 *
 * NOTE(review): $url is concatenated into the SQL text as-is. Confirm that
 * callers escape it or that the $db helper does — neither is visible here.
 *
 * @param string $url Link to refresh.
 * @return bool TRUE when the content row was rewritten, FALSE otherwise.
 */
function Update_link($url)
{
    global $db, $bug_url;

    $is_success = FALSE;

    $spider = new spider();
    $spider->url($url);
    $title = $spider->title;
    $fulltxt = $spider->fulltxt(800);
    $pagesize = $spider->pagesize;
    $keywords = $spider->keywords;
    $htmlcode = $spider->htmlcode;
    $description = $spider->description;

    // One lookup serves both the site_id column and the keyword filters below.
    $site = $db->get_one("select * from ve123_sites where url='" . GetSiteUrl($url) . "'");
    $site_id = isset($site["site_id"]) ? $site["site_id"] : null;

    echo $title;

    $array = array('title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'site_id' => $site_id);
    $db->query("update ve123_links set updatetime='" . time() . "' where url='" . $url . "'");

    // NOTE(review): a "?" in the title presumably signals a mis-decoded page — confirm.
    if (!empty($title) && $pagesize > 1 && strpos((string) $title, '?') === false) {
        $is_shoulu = TRUE;

        if (!empty($site)) {
            if (!empty($site["include_word"])) {
                $include_num = 0;
                foreach (explode(",", $site["include_word"]) as $value) {
                    // Skip blanks left by stray commas; an empty needle would match everything.
                    if ($value !== '' && stristr($htmlcode, $value)) {
                        $include_num += 1;
                    }
                }
                // Accept only when at least one required word was found.
                $is_shoulu = $include_num > 0;
            }
            if (!empty($site["not_include_word"])) {
                $not_include_num = 0;
                foreach (explode(",", $site["not_include_word"]) as $value) {
                    if ($value !== '' && stristr($htmlcode, $value)) {
                        $not_include_num += 1;
                    }
                }
                if ($not_include_num > 0) {
                    $is_shoulu = FALSE;
                }
            }
        }

        if ($is_shoulu) {
            $db->update("ve123_links", $array, "url='" . $url . "'");
            $is_success = TRUE;
        }
    }

    if (empty($bug_url)) {
        exit;
    }
    return $is_success;
}