/**
 * Runs the spider with a fixed start position, direction and command
 * path and checks the reported result against the expected values.
 */
public function spiderUnitTest()
{
    $object = new spider();
    $expected = array("5", "7", "Right");

    // The direction and the command path are string arguments. They were
    // previously written as bare words, which PHP resolves as undefined
    // constants (an Error on PHP 8, a warning on PHP 7.2+).
    $actual = $object->spider(20, 20, 4, 10, 'Left', 'FLFLFRFFLF');

    // Both one-way differences are compared with each other, as in the
    // original assertion.
    $this->assertSame(array_diff($expected, $actual), array_diff($actual, $expected));
}
/**
 * Form handler: reads the spider parameters from the POST body and runs
 * the spider with them.
 *
 * Every field that is missing from $_POST falls back to an empty string.
 */
function spiderform_action()
{
    $input = array();
    foreach (array('x', 'y', 'm', 'n', 'dir', 'path') as $field) {
        $input[$field] = isset($_POST[$field]) ? $_POST[$field] : '';
    }

    // The same six values are handed to the constructor and then to spider().
    $spider = new spider($input['x'], $input['y'], $input['m'], $input['n'], $input['dir'], $input['path']);
    $spider->spider($input['x'], $input['y'], $input['m'], $input['n'], $input['dir'], $input['path']);
}
/**
 * Runs a collection job configured by the POSTed form and writes the
 * result as SQL to dami_caiji.sql under the document root.
 */
function docaiji()
{
    set_time_limit(0);
    import('ORG.Util.Spider');

    $localFlag = (int) $_POST['islocal'];
    $startUrl = trim($_POST['url_list']);
    $encoding = trim($_POST['charset']);
    $pagePattern = trim($_POST['page_list']);
    $saveMode = (int) $_POST['act'];

    // The type id is appended as one extra field/rule pair.
    $fieldNames = $_POST['field'];
    $fieldNames[] = 'typeid';
    $fieldRules = $_POST['role'];
    $fieldRules[] = $_POST['typeid'];

    // Supports single-page or multi-page collection.
    $spider = new spider();
    $spider->islocal = $localFlag;
    $spider->addStartUrl($startUrl);
    $spider->setCharset($encoding);
    $spider->addLayer(0, 'list', $pagePattern);

    $fieldCount = count($fieldNames);
    for ($idx = 0; $idx < $fieldCount; $idx++) {
        $spider->addField($fieldNames[$idx], $fieldRules[$idx]);
    }

    $spider->run();
    $spider->output();

    $sqlFile = $_SERVER['DOCUMENT_ROOT'] . '/dami_caiji.sql';
    $spider->saveSql('dami_article', $sqlFile, $saveMode);
}
/**
 * Fetches a page with the spider and stores its metadata in kuaso_links.
 *
 * @param string $url    Page to fetch.
 * @param string $action "add" inserts a new row, "update" rewrites the row
 *                       whose url matches; any other value does nothing
 *                       after the fetch.
 */
function AddAndUpdateUrl($url, $action)
{
    global $db;

    $crawler = new spider();
    $crawler->url($url);

    // Values are read from the spider in the same order as before.
    $pageTitle = $crawler->title;
    $bodyText = $crawler->fulltxt(800);
    $metaKeywords = $crawler->keywords;
    $metaDescription = $crawler->description;
    $byteSize = $crawler->pagesize;

    $record = array(
        'url' => $url,
        'title' => $pageTitle,
        'fulltxt' => $bodyText,
        'pagesize' => $byteSize,
        'keywords' => $metaKeywords,
        'description' => $metaDescription,
        'updatetime' => time(),
    );

    switch ($action) {
        case "add":
            $db->insert("kuaso_links", $record);
            break;
        case "update":
            // NOTE(review): $url is concatenated into the WHERE clause
            // unescaped - confirm how $db expects values to be quoted.
            $db->update("kuaso_links", $record, "url='" . $url . "'");
            break;
    }
}
/**
 * Resolves $link against $uri, tries to fetch it and prints a message
 * when the fetch fails.
 *
 * @param string $uri  Page the link was found on.
 * @param string $link Link target as written in that page.
 * @return void
 */
protected function checkLink($uri, $link)
{
    $link = spider::absolutePath($link, $uri);

    // file_get_contents() returns false on failure. A truthiness test would
    // also report reachable pages with an empty (or "0") body as broken.
    // Warnings stay suppressed because the failure is reported below.
    if (@file_get_contents($link) === false) {
        // Both values come from crawled pages, so escape them for HTML.
        echo htmlspecialchars($uri, ENT_QUOTES) . ' => ' . htmlspecialchars($link, ENT_QUOTES) . ' is a broken link!<br>';
    }
}
/**
 * Registers a URL in ve123_links.
 *
 * If no row exists for the URL the page is fetched with the spider and its
 * metadata is inserted; otherwise only the row's updatetime is refreshed.
 *
 * @param string $url URL to register.
 */
function add_site($url)
{
    global $db;

    // NOTE(review): $url is concatenated into the SQL text unescaped here and
    // in the update below - confirm how $db expects values to be quoted.
    $row = $db->get_one("select * from ve123_links where url='" . $url . "'");

    if (empty($row)) {
        require_once PATH . "include/spider/spider_class.php";
        $spider = new spider();
        $spider->url($url);

        $title = $spider->title;
        $fulltxt = $spider->fulltxt(800);
        $keywords = $spider->keywords;
        $description = $spider->description;
        $pagesize = $spider->pagesize;
        // The page's HTML used to be copied into a local that was never used;
        // that dead assignment has been removed.

        $array = array(
            "url" => $url,
            "title" => $title,
            "fulltxt" => $fulltxt,
            "pagesize" => $pagesize,
            "keywords" => $keywords,
            "description" => $description,
            "updatetime" => time(),
        );
        $db->insert("ve123_links", $array);
    } else {
        $array = array("updatetime" => time());
        $db->update("ve123_links", $array, "url='" . $url . "'");
    }
}
/**
 * Shows the "create bookmark" form. When a url was posted, the page is
 * fetched first and the fetched information is passed to the view.
 */
function add_url()
{
    $data = array();

    $url = $this->input->post('url');
    if ($url) {
        // Fetch the page and extract its title.
        $fetcher = new spider();
        $data['url'] = $url;
        $data['fetched_info'] = $fetcher->fetch_info($url);
        // Previously observed at controllers/user.php lines 57-58:
        //   Severity: Notice - Message: Undefined index: keywords
        //   Severity: Notice - Message: Undefined index: description
    }

    $data['title'] = '收藏新网页';
    $data['main_content'] = 'create_mark_form';
    $this->load->view('includes/template', $data);
}
require_once 'modules/threadfinder.php'; // clean up input $errors = array(); if (preg_match('/^[A-Za-z0-9-_]+$/', $_POST['boardname'])) { $boardname = trim($_POST['boardname']); } else { $errors[] = 'Invalid Board Name'; } if (preg_match('/^[A-Za-z0-9-_]+$/', $_POST['keyword'])) { $keyword = trim($_POST['keyword']); } else { $errors[] = 'Invalid Keyword'; } if (empty($errors)) { // initate objects $spider = new spider(); $threadFinder = new threadFinder(); // parameters $board = $boardname; $searchkeyword = $keyword; // get threads from specified board $threadFinder->threads = $spider->getThreads($board); $threadsFound = $threadFinder->getBySubject($searchkeyword); if ($threadsFound) { // processing if (count($threadsFound) == 1) { // just one, we got what we need! $threadID = $threadsFound[0]; $threadLink = 'http://boards.4chan.org/' . $board . '/thread/' . $threadID; } else { if (count($threadsFound) > 1) {
?> </th> </tr> </thead> <tbody class="spider-list" id="spider-list-<?php echo md5($furl); ?> "> <?php foreach ($lists as $lkey => $row) { list($_title, $_url) = spiderTools::title_url($row, $rule, $furl); if ($_url === false) { continue; } $hash = md5($_url); if (spider::checker($work, $pid, $_url, $_title) === true) { ?> <tr id="<?php echo $hash; ?> "> <td><input type="checkbox" name="pub[]" value="<?php echo $cid; ?> |<?php echo $pid; ?> |<?php echo $rid; ?> |<?php
?> </th> </tr> </thead> <tbody class="spider-list" id="spider-list-<?php echo md5($furl); ?> "> <?php foreach ($lists as $lkey => $row) { list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $furl); if (spider::$url === false) { continue; } $hash = md5(spider::$url); if (spider::checker($work) === true) { ?> <tr id="<?php echo $hash; ?> "> <td><input type="checkbox" name="pub[]" value="<?php echo $cid; ?> |<?php echo $pid; ?> |<?php echo $rid; ?> |<?php
/**
 * Settings callback that reports whether the given HTML fragment occurs
 * inside the main HTML block extracted from the configured page.
 *
 * A settings message (error or updated) is registered only when $input is
 * not empty.
 *
 * @param string $input The HTML fragment entered by the user
 * @return mixed The result of the 'check_html_included' filter applied to $input
 */
public function check_end_html_included($input)
{
    // The main block is extracted on every call, whether or not $input is set.
    $crawler = new spider($this->domain, $this->URL);
    $mainBlock = $crawler->get_main_html_block(
        $this->HTML,
        get_option('mainHTMLBlock'),
        $this->domain . $this->URL
    );

    if ($input != '') {
        $isPresent = strpos($mainBlock, $input) !== false;
        if ($isPresent) {
            add_settings_error('endHTML', 'settings_updated', 'Found within the main HTML block', 'updated');
        } else {
            add_settings_error('endHTML', esc_attr('settings_updated'), 'Unable to find the end HTML with the main HTML block', 'error');
        }
    }

    return apply_filters('check_html_included', $input, $input);
}
/**
 * Picks a working proxy from the configured proxy list.
 *
 * Candidates are taken at random from spider::$proxy_array (filled from the
 * newline-separated spider::$curl_proxy when empty). Each candidate is
 * probed with one HTTP request; candidates that do not answer with HTTP 200
 * are removed from spider::$proxy_array.
 *
 * @return string|false The first proxy that answered with HTTP 200, or
 *                      false when none is configured or none works.
 */
public static function proxy_test()
{
    $options = array(
        CURLOPT_URL => 'http://www.baidu.com',
        CURLOPT_REFERER => 'http://www.baidu.com',
        CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        CURLOPT_TIMEOUT => 10,
        CURLOPT_CONNECTTIMEOUT => 8,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => 0,
        CURLOPT_NOSIGNAL => true,
        CURLOPT_DNS_USE_GLOBAL_CACHE => true,
        CURLOPT_DNS_CACHE_TIMEOUT => 86400,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );

    if (empty(spider::$proxy_array)) {
        if (empty(spider::$curl_proxy)) {
            return false;
        }
        // One proxy per line, e.g. socks5://127.0.0.1:1080@username:password
        spider::$proxy_array = explode("\n", spider::$curl_proxy);
    }

    // Probe candidates until one works or the list is exhausted. The former
    // recursive retry re-filled the emptied list from spider::$curl_proxy on
    // each call, so it never terminated when every proxy was dead.
    while (!empty(spider::$proxy_array)) {
        $rand_key = array_rand(spider::$proxy_array, 1);
        $proxy = trim(spider::$proxy_array[$rand_key]);

        $ch = curl_init();
        curl_setopt_array($ch, spiderTools::proxy($options, $proxy));
        curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        if ($info['http_code'] == 200) {
            return $proxy;
        }
        unset(spider::$proxy_array[$rand_key]);
    }

    return false;
}
/**
 * Computes the layout for the chart kind(s) listed in $this->type,
 * allocates the GD image and its colours, and hands drawing over to the
 * matching extension class.
 *
 * The first matching branch wins:
 *   "b" and/or "l" - histogram::drawHisto() and/or line::drawLine()
 *   "hb"           - HorizHistogram
 *   "p"            - pie::drawPie()
 *   "s"            - spider::drawSpider()
 *
 * Properties that the caller left unset (mass, mnvs, dvx, AA, w, h, ...)
 * receive defaults here. The colour properties axis_color, font_color and
 * bg_color are overwritten with the values returned by imagecolorallocate().
 */
function create()
{
    if (in_array("b", $this->type) || in_array("l", $this->type)) {
        // Count how many entries of $this->type are 'b'.
        for ($bar = $i = 0; $i < count($this->type); $i++) {
            if ($this->type[$i] == 'b') {
                $bar += 1;
            }
        }
        $this->disbar = $this->larg * $bar;
        $this->ld = $this->larg + $this->disbar; # convenience variable #
        if (in_array("l", $this->type) && $this->disbar == 0) {
            $this->disbar = 2 * $this->larg;
            $this->ld = $this->disbar; # convenience variable #
        }
        // Defaults: mass/mnvs fall back to mx/mn.
        if (!isset($this->mass)) {
            $this->mass = $this->mx;
        }
        if (!isset($this->mnvs)) {
            $this->mnvs = $this->mn;
        }
        if (isset($this->name)) {
            graidle::setLegend($this->name);
        }
        // Default division step, derived from mass.
        if (!isset($this->dvx)) {
            if ($this->mass <= 1) {
                $this->dvx = round($this->mass / 5, 1);
            } else {
                if ($this->mass > 1 && $this->mass < 10) {
                    $this->dvx = 1;
                } else {
                    $this->dvx = round($this->mass / 10);
                }
            }
        }
        if (!isset($this->AA)) {
            $this->AA = 2;
        }
        if ($this->mx > 0) {
            if ($this->mass == $this->mx) {
                $this->scarmax = 1;
            } else {
                $this->scarmax = $this->mass - $this->mx;
            }
        }
        $this->scarmin = $this->mn;
        if ($this->mn < 0) {
            // NOTE(review): mnvs was defaulted above whenever it was unset, so
            // the !isset() half can only be true if mn itself is null - confirm.
            if ($this->mnvs > 0 || !isset($this->mnvs)) {
                $this->scarmin = 0;
            } else {
                if ($this->mnvs > $this->mn || $this->mnvs < $this->mn) {
                    $this->scarmin = $this->mnvs - $this->mn;
                } else {
                    $this->scarmin = -1;
                }
            }
        }
        // y_flag is the longer of the string lengths of mn and mx.
        if (strlen($this->mn) > strlen($this->mx)) {
            $this->y_flag = strlen($this->mn);
        } else {
            $this->y_flag = strlen($this->mx);
        }
        $this->s += $this->font_small * graidle::stringLen($this->mass);
        if (!isset($this->w)) {
            // No width given: step larg by 0.01 until the computed width
            // reaches 640 from below, or drops to 641 or less from above.
            $this->w = $this->ld * $this->cnt + $this->s + $this->d;
            if ($this->w < 640) {
                while ($this->w < 640) {
                    $this->larg += 0.01;
                    $this->disbar = $this->larg * $bar;
                    $this->ld = $this->larg + $this->disbar;
                    $this->w = round($this->ld * $this->cnt) + $this->s + $this->d;
                }
            } else {
                while ($this->w > 641) {
                    $this->larg -= 0.01;
                    $this->disbar = $this->larg * $bar;
                    $this->ld = $this->larg + $this->disbar;
                    $this->w = $this->ld * $this->cnt + $this->s + $this->d;
                }
            }
        } else {
            // Width given: shrink larg, then grow it again, in 0.01 steps
            // around the requested width.
            // NOTE(review): disbar is set to larg here, not larg * $bar as in
            // every other adjustment loop of this method - confirm intended.
            while ($this->ld * $this->cnt + $this->s + $this->d >= $this->w) {
                $this->larg -= 0.01;
                $this->disbar = $this->larg;
                $this->ld = $this->larg + $this->disbar;
            }
            while ($this->ld * $this->cnt + $this->s + $this->d <= $this->w) {
                $this->larg += 0.01;
                $this->disbar = $this->larg;
                $this->ld = $this->larg + $this->disbar;
            }
        }
        if (!isset($this->h)) {
            $this->h = round(3 / 4 * $this->w);
        }
        $this->b += 2 * $this->font_small;
        // mul is the plot height (h - a - b) divided by the value span.
        // NOTE(review): scarmax is only assigned above when mx > 0.
        if ($this->mnvs > 0 && $this->mass > 0) {
            $this->mul = ($this->h - $this->a - $this->b) / ($this->mass - $this->mnvs);
        } else {
            $this->mul = ($this->h - $this->a - $this->b) / ($this->mass + $this->scarmax + (abs($this->mn) - $this->scarmin));
        }
        $this->div = $this->dvx * $this->mul;
        // Create the canvas and turn the hex colour settings into GD colours.
        $this->im = imagecreatetruecolor($this->w, $this->h);
        $rgb = Color::hex2rgb($this->axis_color);
        $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
        $rgb = Color::hex2rgb($this->font_color);
        $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
        $rgb = Color::hex2rgb($this->bg_color);
        $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
        imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
        if (isset($this->legend) || isset($this->name)) {
            graidle::legend();
        }
        graidle::title($this->title, $this->xAxis, $this->yAxis);
        graidle::gradAxis($this->sx, $this->sy);
        if (in_array("b", $this->type)) {
            include "graidle_histo.ext.php";
            histogram::drawHisto();
        }
        graidle::drawAxis();
        if (in_array("l", $this->type)) {
            include "graidle_line.ext.php";
            line::drawLine();
        }
    } else {
        if (in_array("hb", $this->type)) {
            // Count how many entries of $this->type are 'hb'.
            for ($bar = $i = 0; $i < count($this->type); $i++) {
                if ($this->type[$i] == 'hb') {
                    $bar += 1;
                }
            }
            $this->disbar = $this->larg * $bar;
            if (isset($this->name)) {
                graidle::setLegend($this->name);
            }
            if (!isset($this->mass)) {
                $this->mass = $this->mx;
            }
            if (!isset($this->mnvs)) {
                $this->mnvs = $this->mn;
            }
            if (!isset($this->dvx)) {
                if ($this->mass <= 1) {
                    $this->dvx = round($this->mass / 5, 1);
                } else {
                    if ($this->mass > 1 && $this->mass < 10) {
                        $this->dvx = 1;
                    } else {
                        $this->dvx = round($this->mass / 10);
                    }
                }
            }
            if (!isset($this->AA)) {
                $this->AA = 4;
            }
            $this->b += 5 * $this->font_small;
            $this->d += round(graidle::StringLen($this->mass) * ($this->font_small / 4));
            if (isset($this->vlx)) {
                // Widen s by the longest vlx entry. The loop bound is
                // inclusive, so the isset() guard skips the index one past
                // the end.
                for ($maxlen = $i = 0; $i <= count($this->vlx); $i++) {
                    if (isset($this->vlx[$i])) {
                        $curlen = graidle::stringlen($this->vlx[$i]) * $this->font_small;
                        if ($maxlen < $curlen) {
                            $maxlen = $curlen;
                        }
                    }
                }
                $this->s += $maxlen + 10;
            } else {
                $this->s += $this->font_small * 4;
            }
            // NOTE(review): "fontsmall" looks like a typo for "font_small",
            // the property used everywhere else in this method - confirm.
            if (isset($this->yAxis)) {
                $this->s += 2 * $this->fontsmall;
            }
            $this->ld = $this->larg + $this->disbar; # convenience variable #
            if (!isset($this->h)) {
                // No height given: step larg by 0.01 until the computed height
                // reaches 500 from below, or drops to 501 or less from above.
                $this->h = $this->ld * $this->cnt + $this->a + $this->b;
                if ($this->h < 500) {
                    while ($this->h < 500) {
                        $this->larg += 0.01;
                        $this->disbar = $this->larg * $bar;
                        $this->ld = $this->larg + $this->disbar;
                        $this->h = round($this->ld * $this->cnt) + $this->a + $this->b;
                    }
                } else {
                    while ($this->h > 501) {
                        $this->larg -= 0.01;
                        $this->disbar = $this->larg * $bar;
                        $this->ld = $this->larg + $this->disbar;
                        $this->h = $this->ld * $this->cnt + $this->a + $this->b;
                    }
                }
            } else {
                // Height given: grow larg, then shrink it again, in 0.01 steps
                // around the requested height.
                while ($this->ld * $this->cnt + $this->a + $this->b <= $this->h) {
                    $this->larg += 0.01;
                    $this->disbar = $this->larg * $bar;
                    $this->ld = $this->larg + $this->disbar;
                }
                while ($this->ld * $this->cnt + $this->a + $this->b >= $this->h) {
                    $this->larg -= 0.01;
                    $this->disbar = $this->larg * $bar;
                    $this->ld = $this->larg + $this->disbar;
                }
            }
            if (!isset($this->w)) {
                $this->w = round(4 / 5 * $this->h);
            }
            // mul is the plot width (w - s - d) divided by the value span.
            if ($this->mnvs > 0 && $this->mass > 0) {
                $this->mul = ($this->w - $this->s - $this->d) / ($this->mass - $this->mnvs);
            } else {
                $this->mul = ($this->w - $this->s - $this->d) / ($this->mass + abs($this->mnvs));
            }
            $this->im = imagecreatetruecolor($this->w, $this->h);
            $rgb = Color::hex2rgb($this->axis_color);
            $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
            $rgb = Color::hex2rgb($this->font_color);
            $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
            $rgb = Color::hex2rgb($this->bg_color);
            $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
            imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
            if (isset($this->legend) || isset($this->name)) {
                graidle::legend();
            }
            include "graidle_horizhisto.ext.php";
            HorizHistogram::gradAxis($this->sx, $this->sy);
            HorizHistogram::drawHorizHisto();
            HorizHistogram::drawAxis();
            graidle::title($this->title, $this->xAxis, $this->yAxis);
        } else {
            if (in_array("p", $this->type)) {
                include "graidle_pie.ext.php";
                // $this->pie counts how many entries of $this->type are 'p'.
                for ($this->pie = $i = 0; $i < count($this->type); $i++) {
                    if ($this->type[$i] == 'p') {
                        $this->pie += 1;
                    }
                }
                if (!isset($this->incl)) {
                    $this->incl = 55;
                }
                if (!isset($this->AA)) {
                    $this->AA = 4;
                }
                if (!isset($this->w)) {
                    $this->w = 500;
                }
                if (!isset($this->h)) {
                    $this->h = 500;
                }
                $this->tre_d = 0;
                if ($this->incl < 90) {
                    $this->tre_d = round($this->incl / 5);
                }
                // $a and $b are the two values passed to pie::drawPie();
                // $b is kept at $a * sin(incl)^2.
                $this->radius = $this->w;
                $e = sin(deg2rad($this->incl));
                $rapp = pow($e, 2);
                $a = $this->radius;
                $b = $a * $rapp;
                // Shrink $a until it is below the available width (w - s - d).
                while ($a >= $this->w - $this->s - $this->d) {
                    $a -= 1;
                    $this->radius = $a;
                    $b = $a * $rapp;
                }
                // Shrink $b until $b * pie no longer exceeds the available
                // height, rescaling $a by the same ratio.
                while ($b * $this->pie > $this->h - $this->a - $this->pie * $this->b - $this->pie * $this->tre_d) {
                    $b -= 1;
                    $a = $b / $rapp;
                    $this->radius = $a;
                }
                $this->im = imagecreatetruecolor($this->w, $this->h); #<---- create the image for the pie chart
                $rgb = Color::hex2rgb($this->bg_color);
                $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                imagefilltoborder($this->im, 1, 1, 1, $this->bg_color); #<---- create the background
                $rgb = Color::hex2rgb($this->font_color);
                $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                if (isset($this->legend)) {
                    graidle::legend();
                }
                graidle::title($this->title);
                pie::drawPie($a, $b);
            } else {
                if (in_array("s", $this->type)) {
                    include "graidle_spider.ext.php";
                    if (!isset($this->mass)) {
                        $this->mass = $this->mx;
                    }
                    if (!isset($this->filled)) {
                        $this->filled = 1;
                    }
                    if (!isset($this->AA)) {
                        $this->AA = 4;
                    }
                    // Missing dimensions: w defaults to h * 5/4 (or 500),
                    // h defaults to w * 4/5.
                    if (!isset($this->w)) {
                        if (isset($this->h)) {
                            $this->w = round($this->h * (5 / 4));
                        } else {
                            $this->w = 500;
                        }
                    }
                    if (!isset($this->h)) {
                        $this->h = round($this->w * (4 / 5));
                    }
                    if (isset($this->name)) {
                        graidle::setLegend($this->name);
                    }
                    if (!isset($this->dvx)) {
                        if ($this->mass / 10 < 1) {
                            $this->dvx = round($this->mass / 5, 1);
                        } else {
                            $this->dvx = round($this->mass / 10);
                        }
                    }
                    // Start from the available width, shrink until it is below
                    // the available height, then halve it.
                    $this->radius = $this->w - $this->s - $this->d;
                    while ($this->radius >= $this->h - $this->a - $this->b) {
                        $this->radius -= 1;
                    }
                    $this->radius = round($this->radius / 2);
                    $this->im = imagecreatetruecolor($this->w, $this->h); #<---- create the image for the pie chart (wording kept from the original comment; this is the spider branch)
                    $rgb = Color::hex2rgb($this->bg_color);
                    $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                    imagefilltoborder($this->im, 1, 1, 1, $this->bg_color); #<---- create the background
                    $rgb = Color::hex2rgb($this->font_color);
                    $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                    $rgb = Color::hex2rgb($this->axis_color);
                    $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                    if (isset($this->legend)) {
                        graidle::legend();
                    }
                    graidle::title($this->title);
                    spider::drawSpider();
                }
            }
        }
    }
}
/**
 * Fetches one URL and extracts every data item defined by a crawl rule.
 *
 * The target is taken from the stored spider_url row when spider::$sid is
 * set; otherwise from the spider::$rid/$pid/$title/$url statics, each of
 * which can be overridden by the matching non-null argument.
 *
 * Item names may carry a prefix that changes how the item is handled:
 * UNSET:, DATA:, PRE:, EMPTY: and ARRAY: (see the comments in the loop). A
 * name containing "." is stored as $responses[first][last].
 *
 * @param mixed $_pid   Project id override.
 * @param mixed $_rid   Rule id override.
 * @param mixed $_url   URL override.
 * @param mixed $_title Title override.
 * @return array|false  The collected data keyed by item name (after the
 *                      optional spider::$callback['data'] hook), or false
 *                      when the page is empty in shell mode.
 */
public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL)
{
    ini_get('safe_mode') or set_time_limit(0);

    $sid = spider::$sid;
    if ($sid) {
        $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
        $title = $sRs->title;
        $pid = $sRs->pid;
        $url = $sRs->url;
        $rid = $sRs->rid;
    } else {
        $rid = spider::$rid;
        $pid = spider::$pid;
        $title = spider::$title;
        $url = spider::$url;
        $_rid === NULL or $rid = $_rid;
        $_pid === NULL or $pid = $_pid;
        $_title === NULL or $title = $_title;
        $_url === NULL or $url = $_url;
    }

    // Both are read further down even when there is no project, so give
    // them a defined value instead of relying on undefined variables.
    $project = null;
    $prule_list_url = null;
    if ($pid) {
        $project = spider::project($pid);
        $prule_list_url = $project['list_url'];
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];
    $dataArray = $rule['data'];
    if ($prule_list_url) {
        // The project's list URL takes precedence over the rule's.
        $rule['list_url'] = $prule_list_url;
    }

    if (spider::$dataTest) {
        echo "<b>抓取规则信息</b><pre>";
        print_r(iS::escapeStr($ruleA));
        print_r(iS::escapeStr($project));
        echo "</pre><hr />";
    }

    spider::$curl_proxy = $rule['proxy'];
    $responses = array();
    $html = spiderTools::remote($url);
    if (empty($html)) {
        $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
        if (spider::$work == 'shell') {
            echo "{$msg}\n";
            return false;
        } else {
            iPHP::alert($msg);
        }
    }

    // Page cache used by spiderContent::crawl(), which writes entries as
    // spider::$allHtml[$md5]. It must start as an array: since PHP 7.1 an
    // empty string is no longer silently converted on offset assignment.
    spider::$allHtml = array();
    $rule['__url__'] = spider::$url;
    $responses['reurl'] = spider::$url;
    $responses['__title__'] = $title;

    foreach ((array) $dataArray as $key => $data) {
        $content_html = $html;
        $dname = $data['name'];

        // [UNSET:name] - drop the value collected for [name] so far.
        if (strpos($dname, 'UNSET:') !== false) {
            $_dname = str_replace('UNSET:', '', $dname);
            unset($responses[$_dname]);
            continue;
        }

        // [DATA:name] - use the data already processed for [name] as the raw
        // input for this item. Existing data is accumulated; meant for
        // processing the same data several times.
        if (strpos($dname, 'DATA:') !== false) {
            $_dname = str_replace('DATA:', '', $dname);
            $content_html = $responses[$_dname];
            unset($responses[$dname]);
        }

        // [PRE:name] - data collected under PRE:name becomes the raw input
        // for [name]; typically used for downloaded content.
        $pre_dname = 'PRE:' . $dname;
        if (isset($responses[$pre_dname])) {
            $content_html = $responses[$pre_dname];
            unset($responses[$pre_dname]);
        }

        // [EMPTY:name] - fallback item, used only when the earlier result
        // for [name] is empty.
        if (strpos($dname, 'EMPTY:') !== false) {
            $_dname = str_replace('EMPTY:', '', $dname);
            if (empty($responses[$_dname])) {
                $dname = $_dname;
            } else {
                // A value exists already: skip this item.
                continue;
            }
        }

        $content = spiderContent::crawl($content_html, $data, $rule, $responses);
        unset($content_html);

        // [ARRAY:name] - swap the two array dimensions of the result.
        if (strpos($dname, 'ARRAY:') !== false) {
            $dname = str_replace('ARRAY:', '', $dname);
            $cArray = array();
            foreach ((array) $content as $k => $value) {
                foreach ((array) $value as $field => $val) {
                    $cArray[$field][$k] = $val;
                }
            }
            if ($cArray) {
                $content = $cArray;
                unset($cArray);
            }
        }

        if (strpos($dname, '.') !== false) {
            // [name.xxx] - store the content as an element of array [name].
            $f_key = substr($dname, 0, stripos($dname, "."));
            $s_key = substr(strrchr($dname, "."), 1);
            if (isset($responses[$f_key][$s_key])) {
                if (is_array($responses[$f_key][$s_key])) {
                    $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], $content);
                } else {
                    $responses[$f_key][$s_key] .= $content;
                }
            } else {
                $responses[$f_key][$s_key] = $content;
            }
        } else {
            // Several items with the same name: merge their content.
            if (isset($responses[$dname])) {
                if (is_array($responses[$dname])) {
                    $responses[$dname] = array_merge($responses[$dname], $content);
                } else {
                    $responses[$dname] .= $content;
                }
            } else {
                $responses[$dname] = $content;
            }
        }

        // De-duplicate comma-separated multi-match results. The isset()
        // guard skips dotted names and null results, for which there is
        // nothing to split (previously an undefined-index read).
        if (isset($responses[$dname]) && !is_array($responses[$dname]) && $data['multi']) {
            if (strpos($responses[$dname], ',') !== false) {
                $_dnameArray = explode(',', $responses[$dname]);
                $dnameArray = array();
                foreach ((array) $_dnameArray as $value) {
                    $value = trim($value);
                    $value && ($dnameArray[] = $value);
                }
                $dnameArray = array_filter($dnameArray);
                $dnameArray = array_unique($dnameArray);
                $responses[$dname] = implode(',', $dnameArray);
                unset($dnameArray, $_dnameArray);
            }
        }
        gc_collect_cycles();
    }

    // An empty collected title falls back to the title known beforehand.
    if (isset($responses['title']) && empty($responses['title'])) {
        $responses['title'] = $responses['__title__'];
    }

    spider::$allHtml = null;
    unset($html);
    gc_collect_cycles();

    if (spider::$dataTest) {
        echo "<pre style='width:99%;word-wrap: break-word;'>";
        print_r(iS::escapeStr($responses));
        echo '<hr />';
        echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
        echo "</pre>";
    }

    // Reset the file-system helper to the site defaults, then apply the
    // rule's own download and watermark settings on top.
    iFS::$CURLOPT_ENCODING = '';
    iFS::$CURLOPT_REFERER = '';
    iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
    iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
    iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
    iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];
    $rule['fs']['encoding'] && (iFS::$CURLOPT_ENCODING = $rule['fs']['encoding']);
    $rule['fs']['referer'] && (iFS::$CURLOPT_REFERER = $rule['fs']['referer']);
    if ($rule['watermark_mode']) {
        iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
        iFS::$watermark_config['x'] = $rule['watermark']['x'];
        iFS::$watermark_config['y'] = $rule['watermark']['y'];
        $rule['watermark']['img'] && (iFS::$watermark_config['img'] = $rule['watermark']['img']);
    }

    if (spider::$callback['data'] && is_callable(spider::$callback['data'])) {
        $responses = call_user_func_array(spider::$callback['data'], array($responses));
    }
    return $responses;
}
<?php
/**
 * Spider API to get a list of links from a given URL.
 *
 * Request: GET ?url=<page to crawl>; defaults to https://news.ycombinator.com
 * when the parameter is absent.
 *
 * Response: a JSON object with the keys "error", "links" and "cinfo".
 */
include 'spider_class.php';

$url = isset($_GET['url']) ? strip_tags($_GET['url']) : 'https://news.ycombinator.com';
$crawl = new spider($url);

// Store the response in an array.
$arr = array();
$arr['error'] = $crawl->get_error();
$arr['links'] = $crawl->get_list();
$arr['cinfo'] = $crawl->get_info();

// Return the JSON data. json_encode()'s second argument is a bitmask of
// flags: the boolean true passed before was coerced to 1, which equals
// JSON_HEX_TAG, so that flag is now named explicitly.
if (!headers_sent()) {
    header('Content-Type: application/json');
}
echo json_encode($arr, JSON_HEX_TAG);

// The closing tag and the placeholder text that followed it were removed:
// anything after "?>" is sent to the client and corrupted the JSON body.
/**
 * Extracts one data item from a fetched page and post-processes it.
 *
 * Steps:
 *  1. Special rules: "RULE@rid" delegates to spiderUrls::crawl(), and
 *     "RAND@length,numeric" returns random().
 *  2. Match the item in $html. When $data['page'] is set, also match it in
 *     every pagination page and join the parts with '#--iCMS.PageBreak--#'.
 *  3. Apply the post-processing switches found in $data (cleanbefor,
 *     cleanhtml, format, img_absolute, trim, capture, download, cleanafter,
 *     autobreakpage, mergepage, empty, json_decode, array).
 *
 * @param string $html      Fetched page content.
 * @param array  $data      Data item definition.
 * @param array  $rule      Crawl rule.
 * @param array  $responses Items collected so far.
 * @return mixed The processed content; null when the item has no rule;
 *               false when the item must not be empty but is, and
 *               spider::$work is set.
 */
public static function crawl($html, $data, $rule, $responses)
{
    if (trim($data['rule']) === '') {
        return;
    }
    $name = $data['name'];
    if (spider::$dataTest) {
        print_r('<b>[' . $name . ']规则:</b>' . iS::escapeStr($data['rule']));
        echo "<hr />";
    }

    // RULE@rid - treat the content as URLs and crawl them with rule [rid].
    if (strpos($data['rule'], 'RULE@') !== false) {
        spider::$rid = str_replace('RULE@', '', $data['rule']);
        $_urls = trim($html);
        if (spider::$dataTest) {
            print_r('<b>使用[rid:' . spider::$rid . ']规则抓取</b>:' . $_urls);
            echo "<hr />";
        }
        return spiderUrls::crawl('DATA@RULE', false, spider::$rid, $_urls);
    }

    // RAND@10,0 - return a random value.
    if (strpos($data['rule'], 'RAND@') !== false) {
        $random = str_replace('RAND@', '', $data['rule']);
        // Pad so that "RAND@10" (no comma) does not read a missing offset.
        list($length, $numeric) = array_pad(explode(',', $random), 2, null);
        return random($length, empty($numeric) ? 0 : 1);
    }

    $contentArray = array();
    $contentHash = array();
    $_content = spiderContent::match($html, $data, $rule);
    $cmd5 = md5($_content);
    $contentArray[] = $_content;
    $contentHash[$cmd5] = true;

    if ($data['page']) {
        if (empty($rule['page_url'])) {
            $rule['page_url'] = $rule['list_url'];
        }
        if (empty(spider::$allHtml)) {
            // First paged item for this URL: build the page list and fetch it.
            $page_url_array = array();
            $page_url_rule = null; // only used by the debug output below
            $page_area_rule = trim($rule['page_area_rule']);
            if ($page_area_rule) {
                if (strpos($page_area_rule, 'DOM::') !== false) {
                    // Pagination links selected with a phpQuery selector.
                    iPHP::import(iPHP_LIB . '/phpQuery.php');
                    $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
                    $pq_dom = str_replace('DOM::', '', $page_area_rule);
                    $pq_array = phpQuery::pq($pq_dom);
                    foreach ($pq_array as $pn => $pq_val) {
                        $href = phpQuery::pq($pq_val)->attr('href');
                        if ($href) {
                            if ($rule['page_url_rule']) {
                                if (strpos($rule['page_url_rule'], '<%') !== false) {
                                    $page_url_rule = spiderTools::pregTag($rule['page_url_rule']);
                                    if (!preg_match('|' . $page_url_rule . '|is', $href)) {
                                        continue;
                                    }
                                } else {
                                    $cleanhref = spiderTools::dataClean($rule['page_url_rule'], $href);
                                    if ($cleanhref) {
                                        $href = $cleanhref;
                                        unset($cleanhref);
                                    } else {
                                        continue;
                                    }
                                }
                            }
                            $href = str_replace('<%url%>', $href, $rule['page_url']);
                            $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href);
                        }
                    }
                    phpQuery::unloadDocuments($doc->getDocumentID());
                } else {
                    // Pagination area and links selected with regular expressions.
                    $page_area_rule = spiderTools::pregTag($page_area_rule);
                    if ($page_area_rule) {
                        // The former 4th argument was the undefined variable
                        // $PREG_SET_ORDER (i.e. null); omitting it keeps the
                        // default flags that were effectively in use.
                        preg_match('|' . $page_area_rule . '|is', $html, $matches);
                        $page_area = $matches['content'];
                    } else {
                        $page_area = $html;
                    }
                    if ($rule['page_url_rule']) {
                        $page_url_rule = spiderTools::pregTag($rule['page_url_rule']);
                        preg_match_all('|' . $page_url_rule . '|is', $page_area, $page_url_matches, PREG_SET_ORDER);
                        foreach ($page_url_matches as $pn => $row) {
                            $href = str_replace('<%url%>', $row['url'], $rule['page_url']);
                            $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href);
                            gc_collect_cycles();
                        }
                    }
                    unset($page_area);
                }
            } else {
                // Logical mode: derive the page URLs from the content URL.
                if ($rule['page_url_parse'] == '<%url%>') {
                    $page_url = str_replace('<%url%>', $rule['__url__'], $rule['page_url']);
                } else {
                    $page_url_rule = spiderTools::pregTag($rule['page_url_parse']);
                    preg_match('|' . $page_url_rule . '|is', $rule['__url__'], $matches);
                    $page_url = str_replace('<%url%>', $matches['url'], $rule['page_url']);
                }
                if (stripos($page_url, '<%step%>') !== false) {
                    for ($pn = $rule['page_no_start']; $pn <= $rule['page_no_end']; $pn = $pn + $rule['page_no_step']) {
                        $page_url_array[$pn] = str_replace('<%step%>', $pn, $page_url);
                        gc_collect_cycles();
                    }
                }
            }

            // Clean up the URL list: drop empties, duplicates and the
            // content page itself.
            if ($page_url_array) {
                $page_url_array = array_filter($page_url_array);
                $page_url_array = array_unique($page_url_array);
                $puk = array_search($rule['__url__'], $page_url_array);
                if ($puk !== false) {
                    unset($page_url_array[$puk]);
                }
            }
            if (spider::$dataTest) {
                echo "<b>内容页网址:</b>" . $rule['__url__'] . "<br />";
                echo "<b>分页:</b>" . $rule['page_url'] . "<br />";
                echo iS::escapeStr($page_url_rule);
                echo "<hr />";
            }
            if (spider::$dataTest) {
                echo "<b>分页列表:</b><pre>";
                print_r($page_url_array);
                echo "</pre><hr />";
            }

            spider::$content_right_code = trim($rule['page_url_right']);
            spider::$content_error_code = trim($rule['page_url_error']);
            spider::$curl_proxy = $rule['proxy'];

            // The cache is written with string keys below, so it has to be an
            // array. In this branch it is known to be empty, and an empty
            // string is not converted on offset assignment since PHP 7.1.
            if (!is_array(spider::$allHtml)) {
                spider::$allHtml = array();
            }

            $pageurl = array();
            foreach ($page_url_array as $pukey => $purl) {
                $phtml = spiderTools::remote($purl);
                if (empty($phtml)) {
                    break;
                }
                // Stop at the first page whose HTML was already seen.
                $md5 = md5($phtml);
                if (!empty($pageurl[$md5])) {
                    break;
                }
                $check_content = spiderTools::check_content_code($phtml);
                if ($check_content === false) {
                    unset($check_content, $phtml);
                    break;
                }
                // Stop at the first page whose matched content was already seen.
                $_content = spiderContent::match($phtml, $data, $rule);
                $cmd5 = md5($_content);
                if (!empty($contentHash[$cmd5])) {
                    break;
                }
                $contentArray[] = $_content;
                $contentHash[$cmd5] = true;
                $pageurl[$md5] = $purl;
                spider::$allHtml[$md5] = $phtml;
            }
            gc_collect_cycles();
            unset($check_content, $phtml);
            if (spider::$dataTest) {
                echo "<b>最终分页列表:</b><pre>";
                print_r($pageurl);
                echo "</pre><hr />";
            }
        } else {
            // The pages were cached by an earlier item: match against them.
            foreach ((array) spider::$allHtml as $ahkey => $phtml) {
                $contentArray[] = spiderContent::match($phtml, $data, $rule);
            }
        }
    }

    $content = implode('#--iCMS.PageBreak--#', $contentArray);
    $html = null;
    unset($html, $contentArray, $contentHash, $_content);
    $content = stripslashes($content);
    if (spider::$dataTest) {
        print_r('<b>[' . $name . ']匹配结果:</b>' . htmlspecialchars($content));
        echo "<hr />";
    }

    if ($data['cleanbefor']) {
        $content = spiderTools::dataClean($data['cleanbefor'], $content);
    }
    // Substitute [DATA@name] / [DATA@name.key] placeholders with data
    // collected earlier.
    if (strpos($content, '[DATA@') !== false) {
        $content = spiderTools::getDATA($responses, $content);
    }
    if ($data['cleanhtml']) {
        $content = stripslashes($content);
        $content = preg_replace('/<[\\/\\!]*?[^<>]*?>/is', '', $content);
    }
    if ($data['format'] && $content) {
        $content = autoformat($content);
    }
    if ($data['img_absolute'] && $content) {
        // Rewrite every <img src> to an absolute URL.
        preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $img_match);
        if ($img_match[1]) {
            $_img_array = array_unique($img_match[1]);
            $_img_urls = array();
            foreach ((array) $_img_array as $_img_key => $_img_src) {
                $_img_urls[$_img_key] = spiderTools::url_complement($rule['__url__'], $_img_src);
            }
            $content = str_replace($_img_array, $_img_urls, $content);
        }
        unset($img_match, $_img_array, $_img_urls, $_img_src);
    }
    if ($data['trim']) {
        $content = trim($content);
    }
    if ($data['capture']) {
        // The content is a URL: fetch what it points to.
        $content = spiderTools::remote($content);
    }
    if ($data['download']) {
        $content = iFS::http($content);
    }
    if ($data['cleanafter']) {
        $content = spiderTools::dataClean($data['cleanafter'], $content);
    }
    if ($data['autobreakpage']) {
        $content = spiderTools::autoBreakPage($content);
    }
    if ($data['mergepage']) {
        $content = spiderTools::mergePage($content);
    }

    if ($data['empty'] && empty($content)) {
        $emptyMsg = '[' . $name . ']规则设置了不允许为空.当前抓取结果为空!请检查,规则是否正确!';
        if (spider::$dataTest) {
            exit('<h1>' . $emptyMsg . '</h1>');
        }
        if (spider::$work) {
            echo "\n{$emptyMsg}\n";
            return false;
        } else {
            iPHP::alert($emptyMsg);
        }
    }

    if ($data['json_decode']) {
        $content = json_decode($content, true);
    }
    if ($data['array']) {
        return (array) $content;
    }
    if (spider::$callback['content'] && is_callable(spider::$callback['content'])) {
        $content = call_user_func_array(spider::$callback['content'], array($content));
    }
    return $content;
}
?> 快照谨为网络故障时之索引,不代表被搜索网站的即时页面。)</p> <hr style="margin:8px 0;width:100%"> </div> </td></tr> </table> <?php //header("location:".$row["url"].""); $file_name = "www/" . str_replace("http://", "", $url . ".html"); if (file_exists($file_name)) { $htmlcode = file_get_contents($file_name); //echo $file_name; } if (empty($htmlcode)) { $spider = new spider(); $spider->url($url); $htmlcode = $spider->htmlcode; } $htmlcode = replace_html($htmlcode); /* foreach(explode(" ",$wd) as $value) { $htmlcode=str_replace($value,"<font background=#FFFF00>".$value."</font>",$htmlcode); }*/ echo $htmlcode; /* $fp=@fopen("www/".str_replace("http://","",$url.".html"),"w") or die("写方式打开文件失败,请检查程序目录是否为可写");//配置conn.php文件 @fputs($fp,$htmlcode) or die("文件写入失败,请检查程序目录是否为可写"); @fclose($fp); */ function replace_html($string)
/**
 * Crawls a page and records what it links to.
 *
 * Every site reported by the spider is added to ve123_links and
 * ve123_sites unless a row with that URL already exists. Every link
 * reported by the spider is queued in ve123_links_temp unless it is
 * already there. Progress is echoed and flushed as each site is handled.
 *
 * @param string $url Page to crawl.
 */
function insert_links($url)
{
    global $db, $config;

    $spider = new spider();
    $spider->url($url);
    $links = $spider->links();
    $sites = $spider->sites();

    // NOTE(review): URLs are concatenated into the SQL text unescaped
    // throughout - confirm how $db expects values to be quoted.
    foreach ($sites as $value) {
        // Look up the id of the site this URL belongs to. The lookup used an
        // undefined variable ($link) before, so it never saw the current URL.
        $site_url = GetSiteUrl($value);
        $site = $db->get_one("select * from ve123_sites where url='" . $site_url . "'");
        $site_id = $site["site_id"];

        $row = $db->get_one("select * from ve123_links where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            echo $value . "<br>";
            $array = array('url' => $value, 'site_id' => $site_id, 'level' => '0');
            $db->insert("ve123_links", $array);
        } else {
            echo "已存在:" . $value . "<br>";
        }
        // Push the progress line to the browser right away.
        ob_flush();
        flush();

        $row = $db->get_one("select * from ve123_sites where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            $array = array('url' => $value, 'spider_depth' => $config["spider_depth"], 'addtime' => time());
            $db->insert("ve123_sites", $array);
        }
    }

    foreach ($links as $value) {
        $row = $db->get_one("select * from ve123_links_temp where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            $array = array('url' => $value);
            $db->insert("ve123_links_temp", $array);
        }
    }
}
/**
 * Fetch the list pages of a project/rule, extract the entries on them and
 * handle those entries according to $work.
 *
 * List URLs come, in increasing priority, from the rule's 'list_urls', the
 * project's 'urls', spiderUrls::$urls and the $_urls argument. Each line may
 * be:
 *   - "RULE@rid@url": the URL list is obtained by a recursive crawl() call
 *     with $callback = 'CALLBACK@URL';
 *   - a URL containing "<format,begin,num,step,zeroize,reverse>", which is
 *     expanded through spiderTools::mkurls();
 *   - a plain URL.
 *
 * Behaviour per $work:
 *   - 'shell'      : prints progress, publishes every entry accepted by
 *                    spider::checker() through spider::publish("shell"),
 *                    prints the counters and stamps spider_project.lastupdate.
 *                    Returns nothing; returns early (no value) when the
 *                    project's 'psleep' interval has not elapsed, and false
 *                    when the URL list is empty.
 *   - 'WEB@MANUAL' : returns array(cid, rid, pid, sid, work, rule, listsArray).
 *   - 'WEB@AUTO'   : returns the spider_url data arrays of accepted entries.
 *   - 'DATA@RULE'  : returns the spiderData::crawl() result per entry, each
 *                    with the inserted 'spider_url' id attached.
 * When $callback == 'CALLBACK@URL' the entry URLs of the first list page that
 * returns any HTML are returned immediately.
 * NOTE(review): only that first page is used — confirm this is intended.
 *
 * 'WEB@MANUAL' and 'DATA@RULE' now return empty arrays instead of undefined
 * variables when no list page could be fetched.
 *
 * @param string|null $work     Work mode, see above.
 * @param mixed       $pid      Project id; NULL falls back to spider::$pid.
 * @param mixed       $_rid     Rule id used when no other rule id is known.
 * @param string|null $_urls    Newline-separated list URLs overriding all others.
 * @param string|null $callback 'CALLBACK@URL' to get the entry URLs back.
 * @return mixed
 */
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
{
    $pid === NULL && ($pid = spider::$pid);

    // Defaults for everything that is only filled in when a project exists,
    // so later reads never hit an undefined variable.
    $project = array();
    $prule_list_url = null;
    $lastupdate = null;
    // NOTE(review): $sid was never assigned in this method; it is only
    // reported back in the WEB@MANUAL result, where it stays null as before.
    $sid = null;

    if ($pid) {
        $project = spider::project($pid);
        $cid = $project['cid'];
        $rid = $project['rid'];
        $prule_list_url = $project['list_url'];
        $lastupdate = $project['lastupdate'];
    } else {
        $cid = spider::$cid;
        $rid = spider::$rid;
    }
    if (empty($rid) && $_rid !== NULL) {
        $rid = $_rid;
    }

    if ($work == 'shell') {
        // Skip the run when the project was crawled less than 'psleep' seconds ago.
        if (!empty($project['psleep'])) {
            if (time() - $lastupdate < $project['psleep']) {
                echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . $project['psleep'] / 3600 . "小时后在继续采集\n";
                return;
            }
        }
        // "[32m" / "[0m" are ANSI colour sequences; they need the ESC (\033)
        // prefix to be interpreted by the terminal instead of printed verbatim.
        echo "\033[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\033[0m\n";
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];

    // Later sources override earlier ones.
    $urls = $rule['list_urls'];
    if (!empty($project['urls'])) {
        $urls = $project['urls'];
    }
    spiderUrls::$urls && ($urls = spiderUrls::$urls);
    $_urls && ($urls = $_urls);

    $urlsArray = array_filter(explode("\n", $urls));
    $_urlsArray = $urlsArray;
    $urlsList = array();
    if ($work == 'shell') {
        print_r($urlsArray);
    }

    foreach ($_urlsArray as $_key => $_url) {
        $_url = htmlspecialchars_decode($_url);
        $_urlsList = array();

        if (strpos($_url, 'RULE@') !== false) {
            // RULE@rid@url: crawl `url` with rule [rid] and use the returned URL list.
            list($___s, $_rid, $_urls) = array_pad(explode('@', $_url), 3, null);
            if (spider::$ruleTest) {
                print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                echo "<hr />";
            }
            $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
            // The recursive call may return false/null (empty list, nothing
            // fetched); array_merge() only accepts arrays.
            if (!is_array($_urlsList)) {
                $_urlsList = array();
            }
            $urlsList = array_merge($urlsList, $_urlsList);
            unset($urlsArray[$_key]);
        } else {
            preg_match('|.*<(.*)>.*|is', $_url, $_matches);
            if ($_matches) {
                // Missing trailing options are passed on as null.
                list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $_matches[1]), 6, null);
                $url = str_replace($_matches[1], '*', trim($_matches[0]));
                $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$_key]);
                $urlsList = array_merge($urlsList, $_urlsList);
            }
        }
    }
    $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
    unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
    $urlsArray = array_unique($urlsArray);

    if (empty($urlsArray)) {
        if ($work == 'shell') {
            echo "采集列表为空!请填写!\n";
            return false;
        }
        iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
    }

    // Rule mode "2" extracts with phpQuery selectors instead of regular expressions.
    if ($rule['mode'] == "2") {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        if (spider::$ruleTest && !empty($_GET['pq_debug'])) {
            phpQuery::$debug = 1;
        }
    }

    $pubArray = array();
    $pubCount = array();
    $pubAllCount = array();
    $listsArray = array();
    $contentArray = array();
    spider::$curl_proxy = $rule['proxy'];
    spider::$urlslast = null;

    foreach ($urlsArray as $url) {
        $url = trim($url);
        spider::$urlslast = $url;
        if ($work == 'shell') {
            echo '开始采集列表:' . $url . "\n";
        }
        if (spider::$ruleTest) {
            echo '<b>抓取列表:</b>' . $url . "<br />";
        }
        $html = spiderTools::remote($url);
        if (empty($html)) {
            continue;
        }

        if ($rule['mode'] == "2") {
            $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
            $list_area = $doc[trim($rule['list_area_rule'])];
            if ($rule['list_area_format']) {
                $list_area_format = trim($rule['list_area_format']);
                if (strpos($list_area_format, 'ARRAY::') !== false) {
                    // ARRAY:: applies the selector to every matched area separately.
                    $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                    $lists = array();
                    foreach ($list_area as $la) {
                        $lists[] = phpQuery::pq($list_area_format, $la);
                    }
                } else {
                    $lists = phpQuery::pq($list_area_format, $list_area);
                }
            } else {
                $lists = $list_area;
            }
        } else {
            $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
            if ($list_area_rule) {
                // The original passed an undefined variable ($PREG_SET_ORDER)
                // as the flags argument; preg_match() needs no flags here.
                preg_match('|' . $list_area_rule . '|is', $html, $matches);
                $list_area = isset($matches['content']) ? $matches['content'] : null;
            } else {
                $list_area = $html;
            }
            // Release the page source as early as possible.
            $html = null;
            unset($html);
            if (spider::$ruleTest) {
                echo iS::escapeStr($rule['list_area_rule']);
                echo "<hr />";
            }
            if ($rule['list_area_format']) {
                $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
            }
            preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
            $list_area = null;
            unset($list_area);
            // Sort "1" leaves the matches in document order.
            if ($rule['sort'] == "2") {
                asort($lists);
            } elseif ($rule['sort'] == "3") {
                shuffle($lists);
            }
        }

        if (spider::$ruleTest) {
            echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
            echo "<hr />";
            // NOTE(review): in regex mode $list_area has already been released
            // above, so nothing is shown for it here.
            echo '<b>列表区域抓取结果:</b>' . iS::escapeStr(isset($list_area) ? $list_area : null);
            echo "<hr />";
            echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
            echo "<hr />";
            echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
            echo "<hr />";
        }

        // A project-level URL template overrides the rule's.
        if ($prule_list_url) {
            $rule['list_url'] = $prule_list_url;
        }

        // Callback mode: hand the entry URLs of this list page back to the caller.
        if ($callback == 'CALLBACK@URL') {
            $cbListUrl = array();
            foreach ($lists as $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $cbListUrl[] = spider::$url;
            }
            return $cbListUrl;
        }

        if ($work == "shell") {
            $pubCount[$url]['count'] = count($lists);
            $pubAllCount['count'] += $pubCount[$url]['count'];
            echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                echo "title:" . spider::$title . "\n";
                echo "url:" . spider::$url . "\n";
                spider::$rid = $rid;
                $checker = spider::checker($work);
                if ($checker === true) {
                    echo "开始采集....";
                    // Kept in its own variable: the original stored this in
                    // $callback, clobbering the parameter that is compared
                    // with 'CALLBACK@URL' on the next list URL.
                    $published = spider::publish("shell");
                    if (is_array($published) && isset($published['code']) && $published['code'] == "1001") {
                        $pubCount[$url]['success']++;
                        $pubAllCount['success']++;
                        echo "....√\n";
                        if (!empty($project['sleep'])) {
                            echo "sleep:" . $project['sleep'] . "s\n";
                            if ($rule['mode'] != "2") {
                                unset($lists[$lkey]);
                            }
                            gc_collect_cycles();
                            sleep($project['sleep']);
                        }
                    } else {
                        $pubCount[$url]['error']++;
                        $pubAllCount['error']++;
                        echo "error\n\n";
                        continue;
                    }
                }
                $pubCount[$url]['published']++;
                $pubAllCount['published']++;
            }
            if ($rule['mode'] == "2") {
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                unset($lists);
            }
        }

        if ($work == "WEB@MANUAL") {
            $listsArray[$url] = $lists;
        }

        if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);
                if (spider::$ruleTest) {
                    echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                    echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                    echo spider::$url . "<br />";
                    echo $hash . "<br /><hr />";
                    continue;
                }
                if (spider::checker($work) === true || spider::$dataTest) {
                    $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                    switch ($work) {
                        case 'DATA@RULE':
                            $contentArray[$lkey] = spiderData::crawl();
                            unset($suData['sid']);
                            $suData['title'] = addslashes($suData['title']);
                            $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                            // In data-test mode nothing is written, so there is no id.
                            $suid = spider::$dataTest ? null : iDB::insert('spider_url', $suData);
                            $contentArray[$lkey]['spider_url'] = $suid;
                            break;
                        case 'WEB@AUTO':
                            $pubArray[] = $suData;
                            break;
                    }
                }
            }
        }
    }

    $lists = null;
    unset($lists);
    gc_collect_cycles();

    switch ($work) {
        case 'WEB@AUTO':
            return $pubArray;
        case 'DATA@RULE':
            return $contentArray;
        case 'WEB@MANUAL':
            return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
        case "shell":
            echo "采集数据统结果:\n";
            print_r($pubCount);
            print_r($pubAllCount);
            echo "全部采集完成....\n";
            iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
            break;
    }
}
if (empty($site)) { $array = array('url' => $url, 'spider_depth' => $config["spider_depth"], 'indexdate' => time(), 'addtime' => time()); $db->insert("kuaso_sites", $array); } $site = $db->get_one("select * from kuaso_sites where url='{$url}'"); if (!empty($site)) { $ip = ip(); //$referer=$_SERVER['HTTP_REFERER']; $v = $db->get_one("select * from kuaso_stat_visitor where v_ip='" . $ip . "' and v_time>='" . (time() - 86400 * 1) . "'"); if (empty($v)) { $array = array('v_time' => time(), 'v_ip' => $ip); $db->insert("kuaso_stat_visitor", $array); $db->query("update kuaso_sites set com_time='" . time() . "',com_count_ip=com_count_ip+1 where url='" . $url . "'"); } } $site = $db->get_one("select * from kuaso_sites where url='{$url}'"); if (!empty($site)) { $row = $db->get_one("select * from kuaso_links where url='" . $url . "'"); if (empty($row)) { $spider = new spider(); $spider->url($url); $title = $spider->title; $fulltxt = $spider->fulltxt(800); $keywords = $spider->keywords; $description = $spider->description; $pagesize = $spider->pagesize; $array = array('url' => $url, 'title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'updatetime' => time()); $db->insert("kuaso_links", $array); } } }
/**
 * Crawl the current entry with spiderData::crawl(), load the result into
 * $_POST and push it through the application method configured for the
 * project (spider::postArgs()).
 *
 * Steps:
 *   1. In shell mode (spider::$work) an empty title or body aborts with false.
 *   2. spider::checker() must return true, otherwise its result is returned.
 *   3. The matching spider_url row is looked up by URL (or created) unless
 *      spider::$sid is already set; an existing article id is carried over
 *      into $_POST['aid'] / $_POST['adid'].
 *   4. The optional spider::$callback['post'] hook may rewrite $_POST.
 *   5. The target app method is invoked with callbacks that update the
 *      spider_url row.
 *
 * @param string|null $work NULL for interactive use (a success message is
 *                          shown through iPHP::success()), "shell" or
 *                          "WEB@AUTO" to get the app's callback array back.
 * @return mixed false on missing title/body in shell mode, the checker result
 *               when it is not true, the callback array (with 'work' added)
 *               for "shell"/"WEB@AUTO", otherwise nothing.
 */
public static function publish($work = null)
{
    $_POST = spiderData::crawl();

    if (spider::$work == 'shell') {
        // Required fields and the message printed when one is missing.
        $required = array(
            'title' => "标题不能为空\n",
            'body' => "内容不能为空\n",
        );
        foreach ($required as $field => $message) {
            if (empty($_POST[$field])) {
                echo $message;
                return false;
            }
        }
    }

    $verdict = spider::checker($work, spider::$pid, $_POST['reurl'], $_POST['title']);
    if ($verdict !== true) {
        return $verdict;
    }

    $project = spider::project(spider::$pid);
    if (isset($_POST['cid']) === false) {
        $_POST['cid'] = $project['cid'];
    }
    $postArgs = spider::postArgs($project['poid']);

    // Re-publishing an existing article: reuse its ids.
    if ($_GET['indexid']) {
        $aid = (int) $_GET['indexid'];
        $_POST['aid'] = $aid;
        $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='{$aid}'");
    }

    $title = iS::escapeStr($_POST['title']);
    $url = iS::escapeStr($_POST['reurl']);
    $hash = md5($url);

    // Resolve the spider_url row this publication belongs to.
    if (!empty(spider::$sid)) {
        $suid = spider::$sid;
    } else {
        $known = iDB::row("SELECT `id`,`publish`,`indexid` FROM `#iCMS@__spider_url` where `url`='{$url}'", ARRAY_A);
        if (!empty($known)) {
            if ($known['indexid']) {
                $_POST['aid'] = $known['indexid'];
                $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='" . $known['indexid'] . "'");
            }
            $suid = $known['id'];
        } else {
            $suid = iDB::insert('spider_url', array(
                'cid' => $project['cid'],
                'rid' => spider::$rid,
                'pid' => spider::$pid,
                'title' => addslashes($title),
                'url' => $url,
                'hash' => $hash,
                'status' => '1',
                'addtime' => time(),
                'publish' => '0',
                'indexid' => '0',
                'pubdate' => '',
            ));
        }
    }

    // Optional user hook that may rewrite the data about to be posted.
    $postHook = spider::$callback['post'];
    if ($postHook && is_callable($postHook)) {
        $_POST = call_user_func_array($postHook, array($_POST));
    }
    iS::slashes($_POST);

    $app = iACP::app($postArgs->app);
    $method = $postArgs->fun;
    $suidArg = array('suid' => $suid);
    $app->callback['code'] = '1001';
    // Primary-table callback: store the related article id on the spider_url row.
    $app->callback['primary'] = array(array('spider', 'update_spider_url_indexid'), $suidArg);
    // Data-table callback: mark the spider_url row as published.
    $app->callback['data'] = array(array('spider', 'update_spider_url_publish'), $suidArg);

    $result = $app->{$method}();

    if ($result['code'] == $app->callback['code'] && $work === NULL) {
        $js = spider::$sid ? 'js:1' : 'js:parent.$("#' . $hash . '").remove();';
        iPHP::success("发布成功!", $js);
    }

    if ($work == "shell" || $work == "WEB@AUTO") {
        $result['work'] = $work;
        return $result;
    }
}
/**
 * Render the "add / edit project" form.
 *
 * Loads the project identified by $this->pid (if any), builds the category,
 * rule and post-target option lists and includes the "spider.addproject"
 * view. The view is included inside this method, so it runs in this scope;
 * the local variable names below are therefore kept exactly as they are
 * (presumably the template reads them — verify against the view).
 */
function do_addproject()
{
    $rs = array();
    if ($this->pid) {
        $rs = spider::project($this->pid);
    }

    // Fall back to the controller's category when the project has none.
    if (empty($rs['cid'])) {
        $cid = $this->cid;
    } else {
        $cid = $rs['cid'];
    }

    $categoryApp = iACP::app('category', iCMS_APP_ARTICLE);
    $cata_option = $categoryApp->select(false, $cid);
    $rule_option = $this->rule_opt($rs['rid']);
    $post_option = $this->post_opt($rs['poid']);

    include iACP::view("spider.addproject");
}
/**
 * Re-crawl $url and refresh its row in ve123_links.
 *
 * The row's updatetime is always stamped. The crawled title, text, size,
 * keywords, description and site id are written only when:
 *   - the title is non-empty and contains no "?" character,
 *   - the page size is greater than 1,
 *   - the owning site (ve123_sites, looked up through GetSiteUrl($url))
 *     either does not exist, or its comma-separated word filters pass:
 *       include_word     – when set, at least one word must occur in the HTML;
 *       not_include_word – when set, none of the words may occur in the HTML.
 *
 * The original never set the "index this page" flag to TRUE when
 * include_word was configured and a word was found, so such pages were never
 * updated; that is fixed here. Empty filter words (e.g. from a trailing
 * comma) are ignored instead of being treated as a match.
 *
 * NOTE(review): the SQL is assembled by string concatenation from $url; it
 * should be escaped or parameterised with whatever $db provides.
 * NOTE(review): the script terminates when the global $bug_url is empty —
 * kept as-is because callers may depend on it; confirm intent.
 *
 * @param string $url Link to refresh.
 * @return bool TRUE when the link row was updated with fresh content.
 */
function Update_link($url)
{
    global $db, $bug_url;

    $is_success = FALSE;

    $spider = new spider();
    $spider->url($url);
    $title = $spider->title;
    $fulltxt = $spider->fulltxt(800);
    $pagesize = $spider->pagesize;
    $keywords = $spider->keywords;
    $htmlcode = (string) $spider->htmlcode;
    $description = $spider->description;

    // The site row is needed both for its id and for its word filters; fetch it once.
    $site_url = GetSiteUrl($url);
    $site = $db->get_one("select * from ve123_sites where url='" . $site_url . "'");
    $site_id = !empty($site) ? $site["site_id"] : null;

    echo $title;

    $array = array('title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'site_id' => $site_id);
    $db->query("update ve123_links set updatetime='" . time() . "' where url='" . $url . "'");

    if (!empty($title) && $pagesize > 1 && strpos($title, "?") === false) {
        // Index by default; the site's filters can only veto.
        $is_shoulu = TRUE;

        if (!empty($site)) {
            if (!empty($site["include_word"])) {
                // At least one of the required words has to be present.
                $is_shoulu = FALSE;
                foreach (explode(",", $site["include_word"]) as $value) {
                    if ($value !== '' && stripos($htmlcode, $value) !== false) {
                        $is_shoulu = TRUE;
                        break;
                    }
                }
            }
            if (!empty($site["not_include_word"])) {
                // Any forbidden word rejects the page.
                foreach (explode(",", $site["not_include_word"]) as $value) {
                    if ($value !== '' && stripos($htmlcode, $value) !== false) {
                        $is_shoulu = FALSE;
                        break;
                    }
                }
            }
        }

        if ($is_shoulu) {
            $db->update("ve123_links", $array, "url='" . $url . "'");
            $is_success = TRUE;
        }
    }

    if (empty($bug_url)) {
        exit;
    }
    return $is_success;
}