Example #1
0
 /**
  * Checks that spider::spider() returns the expected values for one fixed input.
  *
  * The arguments and the expected triple ("5", "7", "Right") are fixture data;
  * presumably they describe a start position, a heading and a move sequence,
  * but their meaning is defined by the spider class — confirm there.
  */
 public function spiderUnitTest()
 {
     $object = new spider();
     $expected = array("5", "7", "Right");
     // The heading and the path are passed as strings. Left unquoted, PHP reads
     // them as constants: an Error on PHP 8, a notice plus string fallback before.
     $actual = $object->spider(20, 20, 4, 10, 'Left', 'FLFLFRFFLF');
     // Both differences are empty only when the two arrays hold the same values
     $this->assertSame(array_diff($expected, $actual), array_diff($actual, $expected));
 }
Example #2
0
/**
 * Reads the spider form fields from $_POST and runs the spider with them.
 *
 * Each of the fields x, y, m, n, path and dir falls back to an empty string
 * when it was not posted. The same six values are passed to the spider
 * constructor and then to its spider() method.
 */
function spiderform_action()
{
    $form = array();
    foreach (array('x', 'y', 'm', 'n', 'path', 'dir') as $fieldName) {
        $form[$fieldName] = isset($_POST[$fieldName]) ? $_POST[$fieldName] : '';
    }

    $spider = new spider($form['x'], $form['y'], $form['m'], $form['n'], $form['dir'], $form['path']);
    $spider->spider($form['x'], $form['y'], $form['m'], $form['n'], $form['dir'], $form['path']);
}
 /**
  * Runs a collection job configured by the posted form and saves the result as SQL.
  *
  * Reads the start URL, charset, list-page pattern, field names and field rules
  * from $_POST, feeds them to a spider instance, runs it, prints its output and
  * writes SQL for the dami_article table to dami_caiji.sql under the document root.
  */
 function docaiji()
 {
     set_time_limit(0);
     import('ORG.Util.Spider');

     $localFlag = intval($_POST['islocal']);
     $startUrl = trim($_POST['url_list']);
     $encoding = trim($_POST['charset']);
     $listPattern = trim($_POST['page_list']);
     $saveMode = intval($_POST['act']);

     // Field names and their rules are paired by position; the extra pair
     // appended here is the 'typeid' field with the posted typeid value.
     $fieldNames = $_POST['field'];
     $fieldNames[] = 'typeid';
     $fieldRules = $_POST['role'];
     $fieldRules[] = $_POST['typeid'];

     $spider = new spider();
     // Supports collecting a single page or multiple pages
     $spider->islocal = $localFlag;
     $spider->addStartUrl($startUrl);
     $spider->setCharset($encoding);
     $spider->addLayer(0, 'list', $listPattern);

     $total = count($fieldNames);
     for ($idx = 0; $idx < $total; $idx++) {
         $spider->addField($fieldNames[$idx], $fieldRules[$idx]);
     }

     $spider->run();
     $spider->output();

     $sqlFile = $_SERVER['DOCUMENT_ROOT'] . '/dami_caiji.sql';
     $spider->saveSql('dami_article', $sqlFile, $saveMode);
 }
Example #4
0
/**
 * Crawls a URL and stores the extracted page data in the kuaso_links table.
 *
 * @param string $url    Address to crawl; also used to locate the row on update.
 * @param string $action "add" inserts a new row, "update" rewrites the row whose
 *                       url column matches $url; any other value stores nothing.
 */
function AddAndUpdateUrl($url, $action)
{
    global $db;

    $crawler = new spider();
    $crawler->url($url);

    // Read the page data in a fixed order before assembling the row
    $pageTitle = $crawler->title;
    $pageText = $crawler->fulltxt(800);
    $pageKeywords = $crawler->keywords;
    $pageDescription = $crawler->description;
    $pageSize = $crawler->pagesize;

    $record = array(
        'url' => $url,
        'title' => $pageTitle,
        'fulltxt' => $pageText,
        'pagesize' => $pageSize,
        'keywords' => $pageKeywords,
        'description' => $pageDescription,
        'updatetime' => time(),
    );

    switch ($action) {
        case "add":
            $db->insert("kuaso_links", $record);
            break;
        case "update":
            // NOTE(review): $url is concatenated into the SQL condition as-is;
            // confirm callers escape it or switch to the $db escaping facility.
            $db->update("kuaso_links", $record, "url='" . $url . "'");
            break;
    }
}
Example #5
0
 /**
  * Prints a message when a link's target cannot be fetched.
  *
  * @param string $uri  Page the link was found on; passed to spider::absolutePath()
  *                     together with $link to build the address that is fetched.
  * @param string $link Link target as found on the page.
  */
 protected function checkLink($uri, $link)
 {
     $link = spider::absolutePath($link, $uri);
     // file_get_contents() returns false on failure. A plain truthiness test
     // would also report reachable pages with an empty (or "0") body as broken.
     // Warnings are suppressed because a failed fetch is the expected signal here.
     if (@file_get_contents($link) === false) {
         echo "{$uri} => {$link} is a broken link!<br>";
     }
 }
Example #6
0
/**
 * Registers a URL in the ve123_links table, or refreshes its timestamp.
 *
 * When no row with this url exists, the page is crawled and its title, text,
 * keywords, description and size are inserted. When a row exists, only its
 * updatetime is set to the current time.
 *
 * NOTE(review): $url is concatenated into both SQL conditions as-is; confirm
 * callers escape it or switch to the $db escaping facility.
 *
 * @param string $url Address to register.
 */
function add_site($url)
{
    global $db;
    $row = $db->get_one("select * from ve123_links where url='" . $url . "'");
    if (empty($row)) {
        // The spider class is only loaded when a crawl is actually needed
        require_once PATH . "include/spider/spider_class.php";
        $spider = new spider();
        $spider->url($url);
        $title = $spider->title;
        $fulltxt = $spider->fulltxt(800);
        $keywords = $spider->keywords;
        $description = $spider->description;
        $pagesize = $spider->pagesize;
        $array = array("url" => $url, "title" => $title, "fulltxt" => $fulltxt, "pagesize" => $pagesize, "keywords" => $keywords, "description" => $description, "updatetime" => time());
        $db->insert("ve123_links", $array);
    } else {
        $array = array("updatetime" => time());
        $db->update("ve123_links", $array, "url='" . $url . "'");
    }
}
Example #7
0
 /**
  * Renders the bookmark form; when a URL was posted, the page is fetched first
  * and the fetched information is handed to the view as well.
  */
 function add_url()
 {
     $data = array();

     $url = $this->input->post('url');
     if ($url) {
         // Fetch the page and extract its title
         $fetcher = new spider();
         $data['url'] = $url;
         $data['fetched_info'] = $fetcher->fetch_info($url);
         // Known issue, taken from an error log pasted here earlier: PHP notices
         // "Undefined index: keywords" and "Undefined index: description" were
         // reported for controllers/user.php, lines 57 and 58.
     }

     $data['title'] = '收藏新网页';
     $data['main_content'] = 'create_mark_form';
     $this->load->view('includes/template', $data);
 }
Example #8
0
 // Looks up threads on a board whose subject matches a posted keyword.
 // This snippet is cut off before the multi-match branch is complete.
 require_once 'modules/threadfinder.php';
 // clean up input: both values may only contain letters, digits, '-' and '_'
 $errors = array();
 if (preg_match('/^[A-Za-z0-9-_]+$/', $_POST['boardname'])) {
     // trim() cannot change a value that already passed the pattern above
     $boardname = trim($_POST['boardname']);
 } else {
     $errors[] = 'Invalid Board Name';
 }
 if (preg_match('/^[A-Za-z0-9-_]+$/', $_POST['keyword'])) {
     $keyword = trim($_POST['keyword']);
 } else {
     $errors[] = 'Invalid Keyword';
 }
 if (empty($errors)) {
     // instantiate objects
     $spider = new spider();
     $threadFinder = new threadFinder();
     // parameters
     $board = $boardname;
     $searchkeyword = $keyword;
     // get threads from specified board
     $threadFinder->threads = $spider->getThreads($board);
     $threadsFound = $threadFinder->getBySubject($searchkeyword);
     if ($threadsFound) {
         // processing
         if (count($threadsFound) == 1) {
             // just one, we got what we need!
             $threadID = $threadsFound[0];
             $threadLink = 'http://boards.4chan.org/' . $board . '/thread/' . $threadID;
         } else {
             if (count($threadsFound) > 1) {
Example #9
0
    ?>
</th>
          </tr>
        </thead>
        <tbody class="spider-list" id="spider-list-<?php 
    echo md5($furl);
    ?>
">
    <?php 
    foreach ($lists as $lkey => $row) {
        list($_title, $_url) = spiderTools::title_url($row, $rule, $furl);
        if ($_url === false) {
            continue;
        }
        $hash = md5($_url);
        if (spider::checker($work, $pid, $_url, $_title) === true) {
            ?>
          <tr id="<?php 
            echo $hash;
            ?>
">
            <td><input type="checkbox" name="pub[]" value="<?php 
            echo $cid;
            ?>
|<?php 
            echo $pid;
            ?>
|<?php 
            echo $rid;
            ?>
|<?php 
Example #10
0
    ?>
</th>
          </tr>
        </thead>
        <tbody class="spider-list" id="spider-list-<?php 
    echo md5($furl);
    ?>
">
    <?php 
    foreach ($lists as $lkey => $row) {
        list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $furl);
        if (spider::$url === false) {
            continue;
        }
        $hash = md5(spider::$url);
        if (spider::checker($work) === true) {
            ?>
          <tr id="<?php 
            echo $hash;
            ?>
">
            <td><input type="checkbox" name="pub[]" value="<?php 
            echo $cid;
            ?>
|<?php 
            echo $pid;
            ?>
|<?php 
            echo $rid;
            ?>
|<?php 
 /**
  * Checks whether the configured end HTML occurs inside the main HTML block
  * and records the outcome as a settings message.
  *
  * The main block is obtained from the spider for this object's domain and URL.
  * An empty value is not checked and produces no message.
  *
  * @param string $input The end HTML value entered by the user
  * @return mixed $input after passing through the 'check_html_included' filter
  */
 public function check_end_html_included($input)
 {
     $crawler = new spider($this->domain, $this->URL);
     $mainBlock = $crawler->get_main_html_block($this->HTML, get_option('mainHTMLBlock'), $this->domain . $this->URL);

     // Nothing entered: skip the check and return the filtered value
     if ($input == '') {
         return apply_filters('check_html_included', $input, $input);
     }

     $isPresent = strpos($mainBlock, $input) !== false;
     if ($isPresent) {
         add_settings_error('endHTML', 'settings_updated', 'Found within the main HTML block', 'updated');
     } else {
         add_settings_error('endHTML', esc_attr('settings_updated'), 'Unable to find the end HTML with the main HTML block', 'error');
     }

     return apply_filters('check_html_included', $input, $input);
 }
Example #12
0
 /**
  * Picks random proxies from the configured list until one passes a test request.
  *
  * A proxy passes when a request to http://www.baidu.com sent through it is
  * answered with HTTP status 200. Every proxy that fails is removed from
  * spider::$proxy_array. When that list is empty on entry it is loaded from
  * spider::$curl_proxy, which holds one proxy per line.
  *
  * @return string|false The trimmed proxy entry that passed, or false when no
  *                      proxy is configured or every candidate failed.
  */
 public static function proxy_test()
 {
     $baseOptions = array(CURLOPT_URL => 'http://www.baidu.com', CURLOPT_REFERER => 'http://www.baidu.com', CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)', CURLOPT_TIMEOUT => 10, CURLOPT_CONNECTTIMEOUT => 8, CURLOPT_RETURNTRANSFER => 1, CURLOPT_HEADER => 0, CURLOPT_NOSIGNAL => true, CURLOPT_DNS_USE_GLOBAL_CACHE => true, CURLOPT_DNS_CACHE_TIMEOUT => 86400, CURLOPT_SSL_VERIFYPEER => false, CURLOPT_SSL_VERIFYHOST => false);
     if (empty(spider::$proxy_array)) {
         if (empty(spider::$curl_proxy)) {
             return false;
         }
         spider::$proxy_array = explode("\n", spider::$curl_proxy);
         // Example entry: socks5://127.0.0.1:1080@username:password
     }
     // The list is loaded at most once per call and candidates are tried in a
     // loop. Reloading it after the last candidate failed would retry the same
     // dead proxies without ever terminating.
     while (!empty(spider::$proxy_array)) {
         $rand_key = array_rand(spider::$proxy_array, 1);
         $proxy = trim(spider::$proxy_array[$rand_key]);
         $options = spiderTools::proxy($baseOptions, $proxy);
         $ch = curl_init();
         curl_setopt_array($ch, $options);
         curl_exec($ch);
         $info = curl_getinfo($ch);
         curl_close($ch);
         if ($info['http_code'] == 200) {
             return $proxy;
         }
         // Failed candidate: drop it so it is not picked again
         unset(spider::$proxy_array[$rand_key]);
     }
     return false;
 }
Example #13
0
 /**
  * Computes the chart layout and renders the chart into $this->im.
  *
  * The chart kind is chosen from the entries of $this->type, checked in this order:
  *   - 'b' and/or 'l': vertical bars and/or lines on shared axes
  *   - 'hb': horizontal bars
  *   - 'p': pie(s)
  *   - 's': spider chart
  * Each branch fills in unset defaults (scale maximum/minimum, axis step, AA,
  * width, height), derives the bar width or radius from the available space,
  * creates the image, allocates the colours and calls the drawing routines of
  * the matching extension file. Nothing is drawn when no known type is present.
  */
 function create()
 {
     if (in_array("b", $this->type) || in_array("l", $this->type)) {
         // Count the bar series; the space reserved per category grows with it
         for ($bar = $i = 0; $i < count($this->type); $i++) {
             if ($this->type[$i] == 'b') {
                 $bar += 1;
             }
         }
         $this->disbar = $this->larg * $bar;
         $this->ld = $this->larg + $this->disbar;
         # convenience variable #
         if (in_array("l", $this->type) && $this->disbar == 0) {
             // Lines only: reserve a fixed slot of twice the base width
             $this->disbar = 2 * $this->larg;
             $this->ld = $this->disbar;
             # convenience variable #
         }
         // Scale bounds default to the data maximum and minimum
         if (!isset($this->mass)) {
             $this->mass = $this->mx;
         }
         if (!isset($this->mnvs)) {
             $this->mnvs = $this->mn;
         }
         if (isset($this->name)) {
             graidle::setLegend($this->name);
         }
         // Default axis step derived from the scale maximum
         if (!isset($this->dvx)) {
             if ($this->mass <= 1) {
                 $this->dvx = round($this->mass / 5, 1);
             } else {
                 if ($this->mass > 1 && $this->mass < 10) {
                     $this->dvx = 1;
                 } else {
                     $this->dvx = round($this->mass / 10);
                 }
             }
         }
         if (!isset($this->AA)) {
             $this->AA = 2;
         }
         // Offsets between the chosen scale bounds and the data bounds
         if ($this->mx > 0) {
             if ($this->mass == $this->mx) {
                 $this->scarmax = 1;
             } else {
                 $this->scarmax = $this->mass - $this->mx;
             }
         }
         $this->scarmin = $this->mn;
         if ($this->mn < 0) {
             if ($this->mnvs > 0 || !isset($this->mnvs)) {
                 $this->scarmin = 0;
             } else {
                 if ($this->mnvs > $this->mn || $this->mnvs < $this->mn) {
                     $this->scarmin = $this->mnvs - $this->mn;
                 } else {
                     $this->scarmin = -1;
                 }
             }
         }
         // Longest of the two bound labels, in characters
         if (strlen($this->mn) > strlen($this->mx)) {
             $this->y_flag = strlen($this->mn);
         } else {
             $this->y_flag = strlen($this->mx);
         }
         $this->s += $this->font_small * graidle::stringLen($this->mass);
         if (!isset($this->w)) {
             // No width given: adjust the bar width in 0.01 steps until the
             // total width reaches 640 (or drops to 641 when it started above)
             $this->w = $this->ld * $this->cnt + $this->s + $this->d;
             if ($this->w < 640) {
                 while ($this->w < 640) {
                     $this->larg += 0.01;
                     $this->disbar = $this->larg * $bar;
                     $this->ld = $this->larg + $this->disbar;
                     $this->w = round($this->ld * $this->cnt) + $this->s + $this->d;
                 }
             } else {
                 while ($this->w > 641) {
                     $this->larg -= 0.01;
                     $this->disbar = $this->larg * $bar;
                     $this->ld = $this->larg + $this->disbar;
                     $this->w = $this->ld * $this->cnt + $this->s + $this->d;
                 }
             }
         } else {
             // Width given: shrink, then grow the bar width so that the
             // content ends up just over the requested width
             while ($this->ld * $this->cnt + $this->s + $this->d >= $this->w) {
                 $this->larg -= 0.01;
                 $this->disbar = $this->larg;
                 $this->ld = $this->larg + $this->disbar;
             }
             while ($this->ld * $this->cnt + $this->s + $this->d <= $this->w) {
                 $this->larg += 0.01;
                 $this->disbar = $this->larg;
                 $this->ld = $this->larg + $this->disbar;
             }
         }
         if (!isset($this->h)) {
             $this->h = round(3 / 4 * $this->w);
         }
         $this->b += 2 * $this->font_small;
         // Pixels per value unit along the vertical axis
         if ($this->mnvs > 0 && $this->mass > 0) {
             $this->mul = ($this->h - $this->a - $this->b) / ($this->mass - $this->mnvs);
         } else {
             $this->mul = ($this->h - $this->a - $this->b) / ($this->mass + $this->scarmax + (abs($this->mn) - $this->scarmin));
         }
         $this->div = $this->dvx * $this->mul;
         $this->im = imagecreatetruecolor($this->w, $this->h);
         // The colour properties hold hex strings until here; from now on they
         // hold the colour identifiers allocated for this image
         $rgb = Color::hex2rgb($this->axis_color);
         $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
         $rgb = Color::hex2rgb($this->font_color);
         $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
         $rgb = Color::hex2rgb($this->bg_color);
         $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
         imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
         if (isset($this->legend) || isset($this->name)) {
             graidle::legend();
         }
         graidle::title($this->title, $this->xAxis, $this->yAxis);
         graidle::gradAxis($this->sx, $this->sy);
         if (in_array("b", $this->type)) {
             include "graidle_histo.ext.php";
             histogram::drawHisto();
         }
         graidle::drawAxis();
         if (in_array("l", $this->type)) {
             include "graidle_line.ext.php";
             line::drawLine();
         }
     } else {
         if (in_array("hb", $this->type)) {
             // Count the horizontal bar series
             for ($bar = $i = 0; $i < count($this->type); $i++) {
                 if ($this->type[$i] == 'hb') {
                     $bar += 1;
                 }
             }
             $this->disbar = $this->larg * $bar;
             if (isset($this->name)) {
                 graidle::setLegend($this->name);
             }
             if (!isset($this->mass)) {
                 $this->mass = $this->mx;
             }
             if (!isset($this->mnvs)) {
                 $this->mnvs = $this->mn;
             }
             if (!isset($this->dvx)) {
                 if ($this->mass <= 1) {
                     $this->dvx = round($this->mass / 5, 1);
                 } else {
                     if ($this->mass > 1 && $this->mass < 10) {
                         $this->dvx = 1;
                     } else {
                         $this->dvx = round($this->mass / 10);
                     }
                 }
             }
             if (!isset($this->AA)) {
                 $this->AA = 4;
             }
             $this->b += 5 * $this->font_small;
             $this->d += round(graidle::StringLen($this->mass) * ($this->font_small / 4));
             if (isset($this->vlx)) {
                 // Widen the left margin to fit the longest category label
                 for ($maxlen = $i = 0; $i <= count($this->vlx); $i++) {
                     if (isset($this->vlx[$i])) {
                         $curlen = graidle::stringlen($this->vlx[$i]) * $this->font_small;
                         if ($maxlen < $curlen) {
                             $maxlen = $curlen;
                         }
                     }
                 }
                 $this->s += $maxlen + 10;
             } else {
                 $this->s += $this->font_small * 4;
             }
             if (isset($this->yAxis)) {
                 // Extra left margin for the y-axis title. This must read
                 // font_small, the property used everywhere else in this method.
                 $this->s += 2 * $this->font_small;
             }
             $this->ld = $this->larg + $this->disbar;
             # convenience variable #
             if (!isset($this->h)) {
                 // No height given: adjust the bar width in 0.01 steps until the
                 // total height reaches 500 (or drops to 501 when it started above)
                 $this->h = $this->ld * $this->cnt + $this->a + $this->b;
                 if ($this->h < 500) {
                     while ($this->h < 500) {
                         $this->larg += 0.01;
                         $this->disbar = $this->larg * $bar;
                         $this->ld = $this->larg + $this->disbar;
                         $this->h = round($this->ld * $this->cnt) + $this->a + $this->b;
                     }
                 } else {
                     while ($this->h > 501) {
                         $this->larg -= 0.01;
                         $this->disbar = $this->larg * $bar;
                         $this->ld = $this->larg + $this->disbar;
                         $this->h = $this->ld * $this->cnt + $this->a + $this->b;
                     }
                 }
             } else {
                 // Height given: grow, then shrink the bar width so that the
                 // content ends up just under the requested height
                 while ($this->ld * $this->cnt + $this->a + $this->b <= $this->h) {
                     $this->larg += 0.01;
                     $this->disbar = $this->larg * $bar;
                     $this->ld = $this->larg + $this->disbar;
                 }
                 while ($this->ld * $this->cnt + $this->a + $this->b >= $this->h) {
                     $this->larg -= 0.01;
                     $this->disbar = $this->larg * $bar;
                     $this->ld = $this->larg + $this->disbar;
                 }
             }
             if (!isset($this->w)) {
                 $this->w = round(4 / 5 * $this->h);
             }
             // Pixels per value unit along the horizontal axis
             if ($this->mnvs > 0 && $this->mass > 0) {
                 $this->mul = ($this->w - $this->s - $this->d) / ($this->mass - $this->mnvs);
             } else {
                 $this->mul = ($this->w - $this->s - $this->d) / ($this->mass + abs($this->mnvs));
             }
             $this->im = imagecreatetruecolor($this->w, $this->h);
             $rgb = Color::hex2rgb($this->axis_color);
             $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
             $rgb = Color::hex2rgb($this->font_color);
             $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
             $rgb = Color::hex2rgb($this->bg_color);
             $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
             imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
             if (isset($this->legend) || isset($this->name)) {
                 graidle::legend();
             }
             include "graidle_horizhisto.ext.php";
             HorizHistogram::gradAxis($this->sx, $this->sy);
             HorizHistogram::drawHorizHisto();
             HorizHistogram::drawAxis();
             graidle::title($this->title, $this->xAxis, $this->yAxis);
         } else {
             if (in_array("p", $this->type)) {
                 include "graidle_pie.ext.php";
                 // Count the pie series
                 for ($this->pie = $i = 0; $i < count($this->type); $i++) {
                     if ($this->type[$i] == 'p') {
                         $this->pie += 1;
                     }
                 }
                 if (!isset($this->incl)) {
                     $this->incl = 55;
                 }
                 if (!isset($this->AA)) {
                     $this->AA = 4;
                 }
                 if (!isset($this->w)) {
                     $this->w = 500;
                 }
                 if (!isset($this->h)) {
                     $this->h = 500;
                 }
                 // Depth of the 3D effect; stays 0 when the inclination is 90 or more
                 $this->tre_d = 0;
                 if ($this->incl < 90) {
                     $this->tre_d = round($this->incl / 5);
                 }
                 $this->radius = $this->w;
                 // $a and $b are the two axes of the ellipse; their ratio is
                 // the squared sine of the inclination
                 $e = sin(deg2rad($this->incl));
                 $rapp = pow($e, 2);
                 $a = $this->radius;
                 $b = $a * $rapp;
                 // Shrink until the ellipse fits the horizontal space
                 while ($a >= $this->w - $this->s - $this->d) {
                     $a -= 1;
                     $this->radius = $a;
                     $b = $a * $rapp;
                 }
                 // Shrink further until all pies fit the vertical space
                 while ($b * $this->pie > $this->h - $this->a - $this->pie * $this->b - $this->pie * $this->tre_d) {
                     $b -= 1;
                     $a = $b / $rapp;
                     $this->radius = $a;
                 }
                 $this->im = imagecreatetruecolor($this->w, $this->h);
                 #<---- create the image for the pie chart
                 $rgb = Color::hex2rgb($this->bg_color);
                 $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                 imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
                 #<---- create the background
                 $rgb = Color::hex2rgb($this->font_color);
                 $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                 if (isset($this->legend)) {
                     graidle::legend();
                 }
                 graidle::title($this->title);
                 pie::drawPie($a, $b);
             } else {
                 if (in_array("s", $this->type)) {
                     include "graidle_spider.ext.php";
                     if (!isset($this->mass)) {
                         $this->mass = $this->mx;
                     }
                     if (!isset($this->filled)) {
                         $this->filled = 1;
                     }
                     if (!isset($this->AA)) {
                         $this->AA = 4;
                     }
                     // Missing width/height are derived from each other at a
                     // 5:4 ratio; the width defaults to 500 when both are unset
                     if (!isset($this->w)) {
                         if (isset($this->h)) {
                             $this->w = round($this->h * (5 / 4));
                         } else {
                             $this->w = 500;
                         }
                     }
                     if (!isset($this->h)) {
                         $this->h = round($this->w * (4 / 5));
                     }
                     if (isset($this->name)) {
                         graidle::setLegend($this->name);
                     }
                     if (!isset($this->dvx)) {
                         if ($this->mass / 10 < 1) {
                             $this->dvx = round($this->mass / 5, 1);
                         } else {
                             $this->dvx = round($this->mass / 10);
                         }
                     }
                     // Start from the free horizontal space, shrink until it is
                     // below the free vertical space, then halve it
                     $this->radius = $this->w - $this->s - $this->d;
                     while ($this->radius >= $this->h - $this->a - $this->b) {
                         $this->radius -= 1;
                     }
                     $this->radius = round($this->radius / 2);
                     $this->im = imagecreatetruecolor($this->w, $this->h);
                     #<---- create the chart image
                     $rgb = Color::hex2rgb($this->bg_color);
                     $this->bg_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                     imagefilltoborder($this->im, 1, 1, 1, $this->bg_color);
                     #<---- create the background
                     $rgb = Color::hex2rgb($this->font_color);
                     $this->font_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                     $rgb = Color::hex2rgb($this->axis_color);
                     $this->axis_color = imagecolorallocate($this->im, $rgb[0], $rgb[1], $rgb[2]);
                     if (isset($this->legend)) {
                         graidle::legend();
                     }
                     graidle::title($this->title);
                     spider::drawSpider();
                 }
             }
         }
     }
 }
Example #14
0
 /**
  * Fetches one page and extracts every data item defined by the crawl rule.
  *
  * When spider::$sid is set, the target (title, cid, pid, url, rid) is loaded
  * from the spider_url table. Otherwise the static spider::$rid / $pid /
  * $title / $url are used, each overridden by the matching non-NULL argument.
  *
  * Data item names may carry prefixes that change how the item is processed:
  * UNSET:, DATA:, PRE:, EMPTY: and ARRAY: (see the comments in the loop), and a
  * name containing a dot stores the result as a nested array entry.
  *
  * @param mixed $_pid   Project id override, or NULL
  * @param mixed $_rid   Rule id override, or NULL
  * @param mixed $_url   URL override, or NULL
  * @param mixed $_title Title override, or NULL
  * @return array|false  The collected responses (after the optional
  *                      spider::$callback['data'] callback), or false when the
  *                      page is empty and spider::$work is 'shell'
  */
 public static function crawl($_pid = NULL, $_rid = NULL, $_url = NULL, $_title = NULL)
 {
     ini_get('safe_mode') or set_time_limit(0);
     $sid = spider::$sid;
     if ($sid) {
         $sRs = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='{$sid}' LIMIT 1;");
         $title = $sRs->title;
         $cid = $sRs->cid;
         $pid = $sRs->pid;
         $url = $sRs->url;
         $rid = $sRs->rid;
     } else {
         $rid = spider::$rid;
         $pid = spider::$pid;
         $title = spider::$title;
         $url = spider::$url;
         // Explicit arguments win over the static defaults
         $_rid === NULL or $rid = $_rid;
         $_pid === NULL or $pid = $_pid;
         $_title === NULL or $title = $_title;
         $_url === NULL or $url = $_url;
     }
     // NOTE(review): $project and $prule_list_url stay undefined when $pid is
     // falsy, yet both are read further down
     if ($pid) {
         $project = spider::project($pid);
         $prule_list_url = $project['list_url'];
     }
     $ruleA = spider::rule($rid);
     $rule = $ruleA['rule'];
     $dataArray = $rule['data'];
     // A list URL defined on the project replaces the one from the rule
     if ($prule_list_url) {
         $rule['list_url'] = $prule_list_url;
     }
     if (spider::$dataTest) {
         echo "<b>抓取规则信息</b><pre>";
         print_r(iS::escapeStr($ruleA));
         print_r(iS::escapeStr($project));
         echo "</pre><hr />";
     }
     spider::$curl_proxy = $rule['proxy'];
     $responses = array();
     $html = spiderTools::remote($url);
     if (empty($html)) {
         $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
         if (spider::$work == 'shell') {
             echo "{$msg}\n";
             return false;
         } else {
             // presumably iPHP::alert() ends the request; otherwise processing
             // continues with empty $html — TODO confirm
             iPHP::alert($msg);
         }
     }
     //      $http   = spider::check_content_code($html);
     //
     //      if($http['match']==false){
     //          return false;
     //      }
     //      $content        = $http['content'];
     spider::$allHtml = "";
     // NOTE(review): these two use spider::$url, not the local $url resolved above
     $rule['__url__'] = spider::$url;
     $responses['reurl'] = spider::$url;
     $responses['__title__'] = $title;
     foreach ((array) $dataArray as $key => $data) {
         $content_html = $html;
         $dname = $data['name'];
         /**
          * [UNSET:name]
          * Remove [name] from the responses
          * @var string
          */
         if (strpos($dname, 'UNSET:') !== false) {
             $_dname = str_replace('UNSET:', '', $dname);
             unset($responses[$_dname]);
             continue;
         }
         /**
          * [DATA:name]
          * Use the data already processed for [name] as the raw input
          * If data exists already, the new data is appended
          * Used to process data more than once
          * @var string
          */
         if (strpos($dname, 'DATA:') !== false) {
             $_dname = str_replace('DATA:', '', $dname);
             $content_html = $responses[$_dname];
             unset($responses[$dname]);
         }
         /**
          * [PRE:name]
          * Use the data collected under PRE:name as the raw input
          * Generally used for downloading content
          * @var string
          */
         $pre_dname = 'PRE:' . $dname;
         if (isset($responses[$pre_dname])) {
             $content_html = $responses[$pre_dname];
             unset($responses[$pre_dname]);
         }
         /**
          * [EMPTY:name]
          * Use this data item as a replacement when the earlier result for [name] is empty
          * @var string
          */
         if (strpos($dname, 'EMPTY:') !== false) {
             $_dname = str_replace('EMPTY:', '', $dname);
             if (empty($responses[$_dname])) {
                 $dname = $_dname;
             } else {
                 // A value is present, so this item is not crawled
                 continue;
             }
         }
         $content = spiderContent::crawl($content_html, $data, $rule, $responses);
         unset($content_html);
         if (strpos($dname, 'ARRAY:') !== false) {
             // if(strpos($data['rule'], 'RULE@')!==false){
             $dname = str_replace('ARRAY:', '', $dname);
             // $contentArray = $responses[$dname];
             // // $contentArray = $responses[$dname];
             // Swap the two array levels: $content[$k][$key] becomes $cArray[$key][$k]
             $cArray = array();
             foreach ((array) $content as $k => $value) {
                 foreach ((array) $value as $key => $val) {
                     $cArray[$key][$k] = $val;
                 }
             }
             if ($cArray) {
                 $content = $cArray;
                 unset($cArray);
             }
         }
         /**
          * [name.xxx]
          * Store the collected content as an array entry
          */
         if (strpos($dname, '.') !== false) {
             // Part before the first dot and part after the last dot
             $f_key = substr($dname, 0, stripos($dname, "."));
             $s_key = substr(strrchr($dname, "."), 1);
             if (isset($responses[$f_key][$s_key])) {
                 if (is_array($responses[$f_key][$s_key])) {
                     $responses[$f_key][$s_key] = array_merge($responses[$f_key][$s_key], $content);
                 } else {
                     $responses[$f_key][$s_key] .= $content;
                 }
             } else {
                 $responses[$f_key][$s_key] = $content;
             }
         } else {
             /**
              * Merge the content of several items that share one name
              */
             if (isset($responses[$dname])) {
                 if (is_array($responses[$dname])) {
                     $responses[$dname] = array_merge($responses[$dname], $content);
                 } else {
                     $responses[$dname] .= $content;
                 }
             } else {
                 $responses[$dname] = $content;
             }
         }
         /**
          * De-duplicate and filter data that matched multiple entries
          */
         if (!is_array($responses[$dname]) && $data['multi']) {
             if (strpos($responses[$dname], ',') !== false) {
                 // Split on commas, trim, drop empty parts and duplicates, re-join
                 $_dnameArray = explode(',', $responses[$dname]);
                 $dnameArray = array();
                 foreach ((array) $_dnameArray as $key => $value) {
                     $value = trim($value);
                     $value && ($dnameArray[] = $value);
                 }
                 $dnameArray = array_filter($dnameArray);
                 $dnameArray = array_unique($dnameArray);
                 $responses[$dname] = implode(',', $dnameArray);
                 unset($dnameArray, $_dnameArray);
             }
         }
         gc_collect_cycles();
     }
     // An extracted but empty title falls back to the title known before crawling
     if (isset($responses['title']) && empty($responses['title'])) {
         $responses['title'] = $responses['__title__'];
     }
     spider::$allHtml = null;
     unset($html);
     gc_collect_cycles();
     if (spider::$dataTest) {
         echo "<pre style='width:99%;word-wrap: break-word;'>";
         print_r(iS::escapeStr($responses));
         echo '<hr />';
         echo '使用内存:' . iFS::sizeUnit(memory_get_usage()) . ' 执行时间:' . iPHP::timer_stop() . 's';
         echo "</pre>";
     }
     // Reset the iFS settings to the site defaults, then apply the overrides
     // defined by the rule
     iFS::$CURLOPT_ENCODING = '';
     iFS::$CURLOPT_REFERER = '';
     iFS::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
     iFS::$watermark_config['x'] = iCMS::$config['watermark']['x'];
     iFS::$watermark_config['y'] = iCMS::$config['watermark']['y'];
     iFS::$watermark_config['img'] = iCMS::$config['watermark']['img'];
     $rule['fs']['encoding'] && (iFS::$CURLOPT_ENCODING = $rule['fs']['encoding']);
     $rule['fs']['referer'] && (iFS::$CURLOPT_REFERER = $rule['fs']['referer']);
     if ($rule['watermark_mode']) {
         iFS::$watermark_config['pos'] = $rule['watermark']['pos'];
         iFS::$watermark_config['x'] = $rule['watermark']['x'];
         iFS::$watermark_config['y'] = $rule['watermark']['y'];
         $rule['watermark']['img'] && (iFS::$watermark_config['img'] = $rule['watermark']['img']);
     }
     // Optional hook that may rewrite the collected data before it is returned
     if (spider::$callback['data'] && is_callable(spider::$callback['data'])) {
         $responses = call_user_func_array(spider::$callback['data'], array($responses));
     }
     return $responses;
 }
<?php

/**
 * Spider API to get a list of links from a given URL
 * @param $url (string) required
 * @return json data array
 */
include 'spider_class.php';
// The target comes from the "url" query parameter, stripped of tags; without
// it a default page is crawled
$url = isset($_GET['url']) ? strip_tags($_GET['url']) : 'https://news.ycombinator.com';
$crawl = new spider($url);
// store response to array
$arr = array(
    'error' => $crawl->get_error(),
    'links' => $crawl->get_list(),
    'cinfo' => $crawl->get_info(),
);
// Declare the payload type, unless output has already started
if (!headers_sent()) {
    header('Content-Type: application/json');
}
// return json data. The second argument of json_encode() is a flags bitmask,
// not a boolean; JSON_HEX_TAG is the flag the former value `true` resolved to.
echo json_encode($arr, JSON_HEX_TAG);
?>
Enter file contents here
Example #16
0
 /**
  * Extracts one data item from a fetched page, following pagination when the
  * item is configured for it, and then runs the item's post-processing steps.
  *
  * Special rule prefixes handled before normal matching:
  *   - "RULE@<rid>": treats $html as a URL list and delegates to
  *     spiderUrls::crawl() with that rule id.
  *   - "RAND@<length>,<numeric>": returns random($length, $numeric ? 1 : 0).
  *
  * @param  string $html      Fetched page content to match against.
  * @param  array  $data      Data item settings (reads 'rule', 'name', 'page',
  *                           'cleanbefor', 'cleanhtml', 'format', 'img_absolute',
  *                           'trim', 'capture', 'download', 'cleanafter',
  *                           'autobreakpage', 'mergepage', 'empty', 'json_decode',
  *                           'array').
  * @param  array  $rule      Collection rule (pagination settings, '__url__', 'proxy', ...).
  * @param  array  $responses Data items collected so far, used to resolve
  *                           [DATA@name] placeholders in the content.
  * @return mixed  Processed content; null when the item has no rule; false when
  *                the item must not be empty, is empty, and spider::$work is set.
  */
 public static function crawl($html, $data, $rule, $responses)
 {
     if (trim($data['rule']) === '') {
         return;
     }
     $name = $data['name'];
     if (spider::$dataTest) {
         print_r('<b>[' . $name . ']规则:</b>' . iS::escapeStr($data['rule']));
         echo "<hr />";
     }
     if (strpos($data['rule'], 'RULE@') !== false) {
         spider::$rid = str_replace('RULE@', '', $data['rule']);
         $_urls = trim($html);
         if (spider::$dataTest) {
             print_r('<b>使用[rid:' . spider::$rid . ']规则抓取</b>:' . $_urls);
             echo "<hr />";
         }
         return spiderUrls::crawl('DATA@RULE', false, spider::$rid, $_urls);
     }
     // RAND@10,0 - return a random value instead of matching anything.
     if (strpos($data['rule'], 'RAND@') !== false) {
         $random = str_replace('RAND@', '', $data['rule']);
         // Pad so "RAND@10" (no second part) does not read a missing offset.
         list($length, $numeric) = array_pad(explode(',', $random), 2, null);
         return random($length, empty($numeric) ? 0 : 1);
     }
     $contentArray = array();
     $contentHash = array();
     $_content = spiderContent::match($html, $data, $rule);
     $cmd5 = md5($_content);
     $contentArray[] = $_content;
     $contentHash[$cmd5] = true;
     if ($data['page']) {
         if (empty($rule['page_url'])) {
             $rule['page_url'] = $rule['list_url'];
         }
         if (empty(spider::$allHtml)) {
             $page_url_array = array();
             // Defined up front: several branches below never assign it, yet the
             // debug output further down prints it.
             $page_url_rule = null;
             $page_area_rule = trim($rule['page_area_rule']);
             if ($page_area_rule) {
                 if (strpos($page_area_rule, 'DOM::') !== false) {
                     // Pagination links selected with a phpQuery selector.
                     iPHP::import(iPHP_LIB . '/phpQuery.php');
                     $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
                     $pq_dom = str_replace('DOM::', '', $page_area_rule);
                     $pq_array = phpQuery::pq($pq_dom);
                     foreach ($pq_array as $pn => $pq_val) {
                         $href = phpQuery::pq($pq_val)->attr('href');
                         if ($href) {
                             if ($rule['page_url_rule']) {
                                 if (strpos($rule['page_url_rule'], '<%') !== false) {
                                     // Tag-style rule: keep only hrefs that match it.
                                     $page_url_rule = spiderTools::pregTag($rule['page_url_rule']);
                                     if (!preg_match('|' . $page_url_rule . '|is', $href)) {
                                         continue;
                                     }
                                 } else {
                                     // Clean-style rule: rewrite the href, skip it if nothing is left.
                                     $cleanhref = spiderTools::dataClean($rule['page_url_rule'], $href);
                                     if ($cleanhref) {
                                         $href = $cleanhref;
                                         unset($cleanhref);
                                     } else {
                                         continue;
                                     }
                                 }
                             }
                             $href = str_replace('<%url%>', $href, $rule['page_url']);
                             $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href);
                         }
                     }
                     phpQuery::unloadDocuments($doc->getDocumentID());
                 } else {
                     // Pagination area and links selected with regular expressions.
                     $page_area_rule = spiderTools::pregTag($page_area_rule);
                     if ($page_area_rule) {
                         // The original passed the undefined variable $PREG_SET_ORDER as
                         // the flags argument; it evaluated to null, i.e. "no flags".
                         preg_match('|' . $page_area_rule . '|is', $html, $matches);
                         $page_area = isset($matches['content']) ? $matches['content'] : '';
                     } else {
                         $page_area = $html;
                     }
                     if ($rule['page_url_rule']) {
                         $page_url_rule = spiderTools::pregTag($rule['page_url_rule']);
                         preg_match_all('|' . $page_url_rule . '|is', $page_area, $page_url_matches, PREG_SET_ORDER);
                         foreach ($page_url_matches as $pn => $row) {
                             $href = str_replace('<%url%>', $row['url'], $rule['page_url']);
                             $page_url_array[$pn] = spiderTools::url_complement($rule['__url__'], $href);
                             gc_collect_cycles();
                         }
                     }
                     unset($page_area);
                 }
             } else {
                 // Logic mode: build page URLs from a numeric range instead of scraping them.
                 if ($rule['page_url_parse'] == '<%url%>') {
                     $page_url = str_replace('<%url%>', $rule['__url__'], $rule['page_url']);
                 } else {
                     $page_url_rule = spiderTools::pregTag($rule['page_url_parse']);
                     // Same undefined-flags fix as above.
                     preg_match('|' . $page_url_rule . '|is', $rule['__url__'], $matches);
                     $matched_url = isset($matches['url']) ? $matches['url'] : '';
                     $page_url = str_replace('<%url%>', $matched_url, $rule['page_url']);
                 }
                 if (stripos($page_url, '<%step%>') !== false) {
                     // A zero, empty or negative step would never reach page_no_end;
                     // fall back to 1 so the loop always terminates.
                     $page_step = $rule['page_no_step'];
                     if (!is_numeric($page_step) || $page_step <= 0) {
                         $page_step = 1;
                     }
                     for ($pn = $rule['page_no_start']; $pn <= $rule['page_no_end']; $pn = $pn + $page_step) {
                         $page_url_array[$pn] = str_replace('<%step%>', $pn, $page_url);
                         gc_collect_cycles();
                     }
                 }
             }
             // Drop empty and duplicate URLs, and the page we already have.
             if ($page_url_array) {
                 $page_url_array = array_filter($page_url_array);
                 $page_url_array = array_unique($page_url_array);
                 $puk = array_search($rule['__url__'], $page_url_array);
                 if ($puk !== false) {
                     unset($page_url_array[$puk]);
                 }
             }
             if (spider::$dataTest) {
                 echo "<b>内容页网址:</b>" . $rule['__url__'] . "<br />";
                 echo "<b>分页:</b>" . $rule['page_url'] . "<br />";
                 echo iS::escapeStr($page_url_rule);
                 echo "<hr />";
             }
             if (spider::$dataTest) {
                 echo "<b>分页列表:</b><pre>";
                 print_r($page_url_array);
                 echo "</pre><hr />";
             }
             spider::$content_right_code = trim($rule['page_url_right']);
             spider::$content_error_code = trim($rule['page_url_error']);
             spider::$curl_proxy = $rule['proxy'];
             $pageurl = array();
             // Fetch each page in order; stop at the first empty, rejected or repeated one.
             foreach ($page_url_array as $pukey => $purl) {
                 $phtml = spiderTools::remote($purl);
                 if (empty($phtml)) {
                     break;
                 }
                 $md5 = md5($phtml);
                 if (isset($pageurl[$md5])) {
                     // Same raw page seen before.
                     break;
                 }
                 $check_content = spiderTools::check_content_code($phtml);
                 if ($check_content === false) {
                     unset($check_content, $phtml);
                     break;
                 }
                 $_content = spiderContent::match($phtml, $data, $rule);
                 $cmd5 = md5($_content);
                 if (isset($contentHash[$cmd5])) {
                     // Same extracted content seen before.
                     break;
                 }
                 $contentArray[] = $_content;
                 $contentHash[$cmd5] = true;
                 $pageurl[$md5] = $purl;
                 // Cache the raw page so later data items can reuse it without refetching.
                 spider::$allHtml[$md5] = $phtml;
             }
             gc_collect_cycles();
             unset($check_content, $phtml);
             if (spider::$dataTest) {
                 echo "<b>最终分页列表:</b><pre>";
                 print_r($pageurl);
                 echo "</pre><hr />";
             }
         } else {
             // Pages were already fetched for an earlier data item: match against the cache.
             foreach ((array) spider::$allHtml as $ahkey => $phtml) {
                 $contentArray[] = spiderContent::match($phtml, $data, $rule);
             }
         }
     }
     $content = implode('#--iCMS.PageBreak--#', $contentArray);
     $html = null;
     unset($html, $contentArray, $contentHash, $_content);
     $content = stripslashes($content);
     if (spider::$dataTest) {
         print_r('<b>[' . $name . ']匹配结果:</b>' . htmlspecialchars($content));
         echo "<hr />";
     }
     if ($data['cleanbefor']) {
         $content = spiderTools::dataClean($data['cleanbefor'], $content);
     }
     // Resolve references to previously collected items: [DATA@name], [DATA@name.key].
     if (strpos($content, '[DATA@') !== false) {
         $content = spiderTools::getDATA($responses, $content);
     }
     if ($data['cleanhtml']) {
         $content = stripslashes($content);
         $content = preg_replace('/<[\\/\\!]*?[^<>]*?>/is', '', $content);
     }
     if ($data['format'] && $content) {
         $content = autoformat($content);
     }
     if ($data['img_absolute'] && $content) {
         // Rewrite every <img src> to an absolute URL based on the content page URL.
         preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $img_match);
         if ($img_match[1]) {
             $_img_array = array_unique($img_match[1]);
             $_img_urls = array();
             foreach ((array) $_img_array as $_img_key => $_img_src) {
                 $_img_urls[$_img_key] = spiderTools::url_complement($rule['__url__'], $_img_src);
             }
             $content = str_replace($_img_array, $_img_urls, $content);
         }
         unset($img_match, $_img_array, $_img_urls, $_img_src);
     }
     if ($data['trim']) {
         $content = trim($content);
     }
     if ($data['capture']) {
         // The content is a URL: replace it with the result of fetching that URL.
         $content = spiderTools::remote($content);
     }
     if ($data['download']) {
         // The content is a URL: hand it to iFS::http() and keep what it returns.
         $content = iFS::http($content);
     }
     if ($data['cleanafter']) {
         $content = spiderTools::dataClean($data['cleanafter'], $content);
     }
     if ($data['autobreakpage']) {
         $content = spiderTools::autoBreakPage($content);
     }
     if ($data['mergepage']) {
         $content = spiderTools::mergePage($content);
     }
     if ($data['empty'] && empty($content)) {
         $emptyMsg = '[' . $name . ']规则设置了不允许为空.当前抓取结果为空!请检查,规则是否正确!';
         if (spider::$dataTest) {
             exit('<h1>' . $emptyMsg . '</h1>');
         }
         if (spider::$work) {
             echo "\n{$emptyMsg}\n";
             return false;
         } else {
             iPHP::alert($emptyMsg);
         }
     }
     if ($data['json_decode']) {
         $content = json_decode($content, true);
     }
     if ($data['array']) {
         // Array items are returned as-is and bypass the content callback below.
         return (array) $content;
     }
     if (spider::$callback['content'] && is_callable(spider::$callback['content'])) {
         $content = call_user_func_array(spider::$callback['content'], array($content));
     }
     return $content;
 }
Example #17
0
?>
快照谨为网络故障时之索引,不代表被搜索网站的即时页面。)</p>
	<hr style="margin:8px 0;width:100%">
  </div>
</td></tr>
</table>

<?php 
// Cached-snapshot view: print the stored copy of $url, or fetch the page live
// when no stored copy exists.
//header("location:".$row["url"]."");
// Cache path is "www/<url without http://>.html".
// NOTE(review): $url is set outside this fragment; if it can come from the
// request, this path should be validated before it reaches the filesystem - confirm.
$file_name = "www/" . str_replace("http://", "", $url . ".html");
if (file_exists($file_name)) {
    $htmlcode = file_get_contents($file_name);
    //echo $file_name;
}
// No cached copy (or an empty one): crawl the page and use the HTML the spider exposes.
if (empty($htmlcode)) {
    $spider = new spider();
    $spider->url($url);
    $htmlcode = $spider->htmlcode;
}
// replace_html() is declared after this script; its body is not visible here.
$htmlcode = replace_html($htmlcode);
/* Disabled: highlight each search word ($wd) in the snapshot.
   foreach(explode(" ",$wd) as $value)
   {
       $htmlcode=str_replace($value,"<font background=#FFFF00>".$value."</font>",$htmlcode);
   }*/
echo $htmlcode;
/*
Disabled: write the fetched HTML back to the cache file.
$fp=@fopen("www/".str_replace("http://","",$url.".html"),"w") or die("Failed to open the file for writing; check that the program directory is writable");// configure the conn.php file
@fputs($fp,$htmlcode) or die("Failed to write the file; check that the program directory is writable"); 
@fclose($fp);
*/
function replace_html($string)
Example #18
0
/**
 * Crawls $url and queues what the spider found there.
 *
 * Every site URL reported by the spider is added to ve123_links (with the id of
 * its owning site) and to ve123_sites when not already present; every page link
 * is added to ve123_links_temp when not already present. Progress is echoed and
 * flushed as HTML while the site URLs are processed.
 *
 * NOTE(review): the lookups below concatenate crawled URLs straight into SQL.
 * The $db wrapper's escaping/parameter API is not visible from here, so the
 * queries are left as they were - they should be parameterised or escaped.
 *
 * @param string $url Page to crawl.
 * @return void
 */
function insert_links($url)
{
    global $db, $config;
    $spider = new spider();
    $spider->url($url);
    $links = $spider->links();
    $sites = $spider->sites();
    foreach ($sites as $value) {
        // Evaluate once; both inserts below depend on it.
        $valid = is_url($value);
        // Fixed: the original passed the undefined variable $link, so the owning
        // site was never resolved and site_id was always taken from a bogus lookup.
        $site_url = GetSiteUrl($value);
        $site = $db->get_one("select * from ve123_sites where url='" . $site_url . "'");
        $site_id = $site["site_id"];
        $row = $db->get_one("select * from ve123_links where url='" . $value . "'");
        if (empty($row) && $valid) {
            echo $value . "<br>";
            $array = array('url' => $value, 'site_id' => $site_id, 'level' => '0');
            $db->insert("ve123_links", $array);
        } else {
            echo "已存在:" . $value . "<br>";
        }
        // ob_flush() raises a notice when no output buffer is active.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
        $row = $db->get_one("select * from ve123_sites where url='" . $value . "'");
        if (empty($row) && $valid) {
            $array = array('url' => $value, 'spider_depth' => $config["spider_depth"], 'addtime' => time());
            $db->insert("ve123_sites", $array);
        }
    }
    foreach ($links as $value) {
        $row = $db->get_one("select * from ve123_links_temp where url='" . $value . "'");
        if (empty($row) && is_url($value)) {
            $array = array('url' => $value);
            $db->insert("ve123_links_temp", $array);
        }
    }
}
Example #19
0
 /**
  * Crawls the list pages of a collection rule and processes the entries found,
  * in a way that depends on $work:
  *
  *   - 'shell'      : publishes each new entry via spider::publish(), prints
  *                    progress and counters, then updates the project's lastupdate.
  *   - 'WEB@AUTO'   : returns one spider_url row array per entry that passes spider::checker().
  *   - 'WEB@MANUAL' : returns the raw list matches per list URL plus context.
  *   - 'DATA@RULE'  : crawls each entry with spiderData::crawl() and returns the results.
  *
  * With $callback === 'CALLBACK@URL' the entry URLs of the first list page that
  * yields content are returned instead.
  * NOTE(review): that return sits inside the per-URL loop, so later list URLs are
  * never visited in callback mode - confirm this is intended.
  *
  * List URLs may contain "RULE@<rid>@<url>" (expand through another rule) or a
  * "<format,begin,num,step,zeroize,reverse>" range handled by spiderTools::mkurls().
  *
  * @param  string|null $work     Work mode, see above.
  * @param  mixed       $pid      Project id; null falls back to spider::$pid.
  * @param  mixed       $_rid     Rule id used when the project does not provide one.
  * @param  string|null $_urls    Newline separated list URLs overriding rule/project URLs.
  * @param  string|null $callback 'CALLBACK@URL' to get entry URLs back.
  * @return mixed       See the mode descriptions; null in shell mode on completion,
  *                     false in shell mode when there are no list URLs.
  */
 public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
 {
     $pid === NULL && ($pid = spider::$pid);
     // Defaults for everything that is only assigned on some paths below, so no
     // path reads an undefined variable.
     $project = array();
     $prule_list_url = null;
     $lastupdate = 0;
     $contentArray = array();
     $listsArray = array();
     // NOTE(review): 'sid' in the WEB@MANUAL result was read from an undefined
     // local in the original (i.e. always null); kept as null - confirm whether
     // spider::$sid was meant.
     $sid = null;
     if ($pid) {
         $project = spider::project($pid);
         $cid = $project['cid'];
         $rid = $project['rid'];
         $prule_list_url = $project['list_url'];
         $lastupdate = $project['lastupdate'];
     } else {
         $cid = spider::$cid;
         $rid = spider::$rid;
     }
     if (empty($rid) && $_rid !== NULL) {
         $rid = $_rid;
     }
     if ($work == 'shell') {
         // Respect the project's minimum interval between two runs.
         if (!empty($project['psleep'])) {
             if (time() - $lastupdate < $project['psleep']) {
                 echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . ($project['psleep'] / 3600) . "小时后在继续采集\n";
                 return;
             }
         }
         echo "开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\n";
     }
     $ruleA = spider::rule($rid);
     $rule = $ruleA['rule'];
     // URL source precedence (last one set wins): rule < project < spiderUrls::$urls < $_urls.
     $urls = $rule['list_urls'];
     !empty($project['urls']) && ($urls = $project['urls']);
     spiderUrls::$urls && ($urls = spiderUrls::$urls);
     $_urls && ($urls = $_urls);
     $urlsArray = explode("\n", (string) $urls);
     $urlsArray = array_filter($urlsArray);
     $_urlsArray = $urlsArray;
     $urlsList = array();
     if ($work == 'shell') {
         print_r($urlsArray);
     }
     // Expand RULE@ references and <...> ranges into concrete list URLs.
     foreach ($_urlsArray as $_key => $_url) {
         $_url = htmlspecialchars_decode($_url);
         $_urlsList = array();
         if (strpos($_url, 'RULE@') !== false) {
             // RULE@rid@url: crawl url with rule [rid] and use the entry URLs it returns.
             // Limit to 3 parts so a URL that itself contains '@' is not truncated.
             list($___s, $_rid, $_urls) = array_pad(explode('@', $_url, 3), 3, null);
             if (spider::$ruleTest) {
                 print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                 echo "<hr />";
             }
             $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
             // The nested call returns null/false when nothing could be fetched;
             // array_merge() would throw on that.
             if (is_array($_urlsList)) {
                 $urlsList = array_merge($urlsList, $_urlsList);
             }
             unset($urlsArray[$_key]);
         } else {
             preg_match('|.*<(.*)>.*|is', $_url, $_matches);
             if ($_matches) {
                 // Missing trailing options are passed on as null.
                 list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $_matches[1]), 6, null);
                 $url = str_replace($_matches[1], '*', trim($_matches[0]));
                 $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                 unset($urlsArray[$_key]);
                 if (is_array($_urlsList)) {
                     $urlsList = array_merge($urlsList, $_urlsList);
                 }
             }
         }
     }
     $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
     unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
     $urlsArray = array_unique($urlsArray);
     if (empty($urlsArray)) {
         if ($work == 'shell') {
             echo "采集列表为空!请填写!\n";
             return false;
         }
         iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
     }
     // Rule mode "2" selects list entries with phpQuery instead of regular expressions.
     if ($rule['mode'] == "2") {
         iPHP::import(iPHP_LIB . '/phpQuery.php');
         spider::$ruleTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
     }
     $pubArray = array();
     $pubCount = array();
     // Counters start at zero so the increments below never hit a missing key.
     $pubAllCount = array('count' => 0, 'success' => 0, 'error' => 0, 'published' => 0);
     spider::$curl_proxy = $rule['proxy'];
     spider::$urlslast = null;
     foreach ($urlsArray as $key => $url) {
         $url = trim($url);
         spider::$urlslast = $url;
         if ($work == 'shell') {
             echo '开始采集列表:' . $url . "\n";
         }
         if (spider::$ruleTest) {
             echo '<b>抓取列表:</b>' . $url . "<br />";
         }
         $html = spiderTools::remote($url);
         if (empty($html)) {
             continue;
         }
         if ($rule['mode'] == "2") {
             $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
             $list_area = $doc[trim($rule['list_area_rule'])];
             if ($rule['list_area_format']) {
                 $list_area_format = trim($rule['list_area_format']);
                 if (strpos($list_area_format, 'ARRAY::') !== false) {
                     // ARRAY:: applies the selector inside each list-area node separately.
                     $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                     $lists = array();
                     foreach ($list_area as $la_key => $la) {
                         $lists[] = phpQuery::pq($list_area_format, $la);
                     }
                 } else {
                     $lists = phpQuery::pq($list_area_format, $list_area);
                 }
             } else {
                 $lists = $list_area;
             }
         } else {
             $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
             if ($list_area_rule) {
                 // The original passed the undefined variable $PREG_SET_ORDER as the
                 // flags argument; it evaluated to null, i.e. "no flags".
                 preg_match('|' . $list_area_rule . '|is', $html, $matches);
                 $list_area = isset($matches['content']) ? $matches['content'] : '';
             } else {
                 $list_area = $html;
             }
             $html = null;
             unset($html);
             if (spider::$ruleTest) {
                 echo iS::escapeStr($rule['list_area_rule']);
                 echo "<hr />";
             }
             if ($rule['list_area_format']) {
                 $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
             }
             preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
             // sort "1" is a deliberate no-op here (its arsort() was disabled upstream).
             if ($rule['sort'] == "2") {
                 asort($lists);
             } elseif ($rule['sort'] == "3") {
                 shuffle($lists);
             }
         }
         if (spider::$ruleTest) {
             echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
             echo "<hr />";
             echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area);
             echo "<hr />";
             echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
             echo "<hr />";
             echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
             echo "<hr />";
         }
         // Release the list area only now: the original freed it before the debug
         // output above, which then printed an undefined variable in regex mode.
         if ($rule['mode'] != "2") {
             $list_area = null;
             unset($list_area);
         }
         if ($prule_list_url) {
             $rule['list_url'] = $prule_list_url;
         }
         // Callback mode: hand the entry URLs of this list page back to the caller.
         if ($callback == 'CALLBACK@URL') {
             $cbListUrl = array();
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 $cbListUrl[] = spider::$url;
             }
             return $cbListUrl;
         }
         if ($work == "shell") {
             $pubCount[$url] = array('count' => count($lists), 'success' => 0, 'error' => 0, 'published' => 0);
             $pubAllCount['count'] += $pubCount[$url]['count'];
             echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 $hash = md5(spider::$url);
                 echo "title:" . spider::$title . "\n";
                 echo "url:" . spider::$url . "\n";
                 spider::$rid = $rid;
                 $checker = spider::checker($work);
                 if ($checker === true) {
                     echo "开始采集....";
                     // Kept in its own variable: the original stored this in $callback
                     // and so clobbered the method parameter of the same name.
                     $published = spider::publish("shell");
                     // publish() can return a non-array (e.g. false), hence the isset().
                     if (isset($published['code']) && $published['code'] == "1001") {
                         $pubCount[$url]['success']++;
                         $pubAllCount['success']++;
                         echo "....√\n";
                         if (!empty($project['sleep'])) {
                             echo "sleep:" . $project['sleep'] . "s\n";
                             if ($rule['mode'] != "2") {
                                 unset($lists[$lkey]);
                             }
                             gc_collect_cycles();
                             sleep($project['sleep']);
                         }
                     } else {
                         $pubCount[$url]['error']++;
                         $pubAllCount['error']++;
                         echo "error\n\n";
                         continue;
                     }
                 }
                 // Counts successfully published entries and entries the checker skipped.
                 $pubCount[$url]['published']++;
                 $pubAllCount['published']++;
             }
             if ($rule['mode'] == "2") {
                 phpQuery::unloadDocuments($doc->getDocumentID());
             } else {
                 unset($lists);
             }
         }
         if ($work == "WEB@MANUAL") {
             $listsArray[$url] = $lists;
         }
         if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 $hash = md5(spider::$url);
                 if (spider::$ruleTest) {
                     echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                     echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                     echo spider::$url . "<br />";
                     echo $hash . "<br /><hr />";
                 } else {
                     if (spider::checker($work) === true || spider::$dataTest) {
                         $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                         switch ($work) {
                             case 'DATA@RULE':
                                 $contentArray[$lkey] = spiderData::crawl();
                                 unset($suData['sid']);
                                 $suData['title'] = addslashes($suData['title']);
                                 $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                                 // Nothing is stored in test mode, so the row id stays null.
                                 $suid = null;
                                 spider::$dataTest or $suid = iDB::insert('spider_url', $suData);
                                 $contentArray[$lkey]['spider_url'] = $suid;
                                 break;
                             case 'WEB@AUTO':
                                 $pubArray[] = $suData;
                                 break;
                         }
                     }
                 }
             }
         }
     }
     $lists = null;
     unset($lists);
     gc_collect_cycles();
     switch ($work) {
         case 'WEB@AUTO':
             return $pubArray;
         case 'DATA@RULE':
             return $contentArray;
         case 'WEB@MANUAL':
             return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
         case "shell":
             echo "采集数据统结果:\n";
             print_r($pubCount);
             print_r($pubAllCount);
             echo "全部采集完成....\n";
             iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
             break;
     }
 }
Example #20
0
    if (empty($site)) {
        $array = array('url' => $url, 'spider_depth' => $config["spider_depth"], 'indexdate' => time(), 'addtime' => time());
        $db->insert("kuaso_sites", $array);
    }
    $site = $db->get_one("select * from kuaso_sites where url='{$url}'");
    if (!empty($site)) {
        $ip = ip();
        //$referer=$_SERVER['HTTP_REFERER'];
        $v = $db->get_one("select * from kuaso_stat_visitor where v_ip='" . $ip . "' and v_time>='" . (time() - 86400 * 1) . "'");
        if (empty($v)) {
            $array = array('v_time' => time(), 'v_ip' => $ip);
            $db->insert("kuaso_stat_visitor", $array);
            $db->query("update kuaso_sites set com_time='" . time() . "',com_count_ip=com_count_ip+1 where url='" . $url . "'");
        }
    }
    $site = $db->get_one("select * from kuaso_sites where url='{$url}'");
    if (!empty($site)) {
        $row = $db->get_one("select * from kuaso_links where url='" . $url . "'");
        if (empty($row)) {
            $spider = new spider();
            $spider->url($url);
            $title = $spider->title;
            $fulltxt = $spider->fulltxt(800);
            $keywords = $spider->keywords;
            $description = $spider->description;
            $pagesize = $spider->pagesize;
            $array = array('url' => $url, 'title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'updatetime' => time());
            $db->insert("kuaso_links", $array);
        }
    }
}
Example #21
0
 /**
  * Crawls the current entry and publishes it through the project's target app.
  *
  * The crawled data replaces $_POST, because the target app method reads its
  * input from there. A spider_url row is looked up by URL (or created) so the
  * app callbacks can link it to the published record.
  *
  * @param  string|null $work null for interactive use (reports success through
  *                           iPHP::success()), "shell" or "WEB@AUTO" to get the
  *                           app's callback array back.
  * @return mixed  false when shell mode finds an empty title/body; the checker's
  *                result when it is not true; the callback array (with 'work'
  *                added) for "shell"/"WEB@AUTO"; otherwise nothing.
  */
 public static function publish($work = null)
 {
     $_POST = spiderData::crawl();

     // Shell runs refuse entries without a title or a body.
     if (spider::$work == 'shell') {
         $required = array('title' => "标题不能为空\n", 'body' => "内容不能为空\n");
         foreach ($required as $field => $message) {
             if (empty($_POST[$field])) {
                 echo $message;
                 return false;
             }
         }
     }

     $verdict = spider::checker($work, spider::$pid, $_POST['reurl'], $_POST['title']);
     if ($verdict !== true) {
         return $verdict;
     }

     $proj = spider::project(spider::$pid);
     if (!isset($_POST['cid'])) {
         $_POST['cid'] = $proj['cid'];
     }
     $target = spider::postArgs($proj['poid']);

     // An explicit indexid means "update this existing article".
     if ($_GET['indexid']) {
         $articleId = (int) $_GET['indexid'];
         $_POST['aid'] = $articleId;
         $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='" . $articleId . "'");
     }

     $safeTitle = iS::escapeStr($_POST['title']);
     $safeUrl = iS::escapeStr($_POST['reurl']);
     $urlHash = md5($safeUrl);

     // Resolve the spider_url row id: given, already stored, or newly inserted.
     if (!empty(spider::$sid)) {
         $rowId = spider::$sid;
     } else {
         $known = iDB::row("SELECT `id`,`publish`,`indexid` FROM `#iCMS@__spider_url` where `url`='" . $safeUrl . "'", ARRAY_A);
         if (!empty($known)) {
             if ($known['indexid']) {
                 // Already published once: update that article instead of adding a new one.
                 $_POST['aid'] = $known['indexid'];
                 $_POST['adid'] = iDB::value("SELECT `id` FROM `#iCMS@__article_data` WHERE aid='" . $known['indexid'] . "'");
             }
             $rowId = $known['id'];
         } else {
             $rowId = iDB::insert('spider_url', array(
                 'cid' => $proj['cid'],
                 'rid' => spider::$rid,
                 'pid' => spider::$pid,
                 'title' => addslashes($safeTitle),
                 'url' => $safeUrl,
                 'hash' => $urlHash,
                 'status' => '1',
                 'addtime' => time(),
                 'publish' => '0',
                 'indexid' => '0',
                 'pubdate' => '',
             ));
         }
     }

     $postHook = spider::$callback['post'];
     if ($postHook && is_callable($postHook)) {
         $_POST = call_user_func_array($postHook, array($_POST));
     }
     iS::slashes($_POST);

     $app = iACP::app($target->app);
     $method = $target->fun;
     $app->callback['code'] = '1001';
     // Main-table callback: store the published record's id on the spider_url row.
     $app->callback['primary'] = array(array('spider', 'update_spider_url_indexid'), array('suid' => $rowId));
     // Data-table callback: mark the spider_url row as published.
     $app->callback['data'] = array(array('spider', 'update_spider_url_publish'), array('suid' => $rowId));
     $outcome = $app->{$method}();

     // Interactive mode only: report success to the browser.
     if ($outcome['code'] == $app->callback['code'] && $work === NULL) {
         $script = spider::$sid ? 'js:1' : 'js:parent.$("#' . $urlHash . '").remove();';
         iPHP::success("发布成功!", $script);
     }

     if ($work == "shell" || $work == "WEB@AUTO") {
         $outcome['work'] = $work;
         return $outcome;
     }
 }
Example #22
0
 /**
  * Prepares the data for the "spider.addproject" view and includes it.
  *
  * The locals set here ($rs, $cid, $cata_option, $rule_option, $post_option)
  * are in scope for the included view file.
  * NOTE(review): presumably the view reads them by these exact names - confirm
  * before renaming any of them.
  */
 function do_addproject()
 {
     $rs = array();
     // Load the stored project only when a project id is set; otherwise $rs stays empty.
     $this->pid && ($rs = spider::project($this->pid));
     // Prefer the project's own category, falling back to the one on this object.
     $cid = empty($rs['cid']) ? $this->cid : $rs['cid'];
     $categoryApp = iACP::app('category', iCMS_APP_ARTICLE);
     $cata_option = $categoryApp->select(false, $cid);
     $rule_option = $this->rule_opt($rs['rid']);
     $post_option = $this->post_opt($rs['poid']);
     //$rs['sleep'] OR $rs['sleep'] = 30;
     include iACP::view("spider.addproject");
 }
Example #23
0
/**
 * Re-crawl a URL and refresh its row in ve123_links.
 *
 * The link's `updatetime` is always bumped. The indexed fields (title,
 * fulltxt, pagesize, keywords, description, site_id) are rewritten only
 * when ALL of the following hold:
 *   - the fetched title is non-empty and contains no "?";
 *   - the reported page size is greater than 1;
 *   - the owning site (looked up in ve123_sites by GetSiteUrl($url)) is
 *     either not registered, or
 *       * has no `include_word` list, or at least one listed word occurs
 *         in the page HTML, and
 *       * has no `not_include_word` list, or none of its words occurs.
 *
 * Word lists are comma separated and matched case-insensitively; empty
 * entries (e.g. from a trailing comma) are ignored.
 *
 * Side effects: echoes the fetched title, and terminates the script with
 * `exit` when the global $bug_url is empty.
 *
 * NOTE(review): $url and the derived site URL are concatenated into SQL
 * without escaping. The escaping/parameter API of $db is not visible
 * here, so this is flagged rather than changed - confirm and fix at the
 * DB layer.
 *
 * @param string $url URL of the link to refresh.
 * @return bool TRUE when the link row was rewritten, FALSE otherwise.
 */
function Update_link($url)
{
    global $db, $bug_url;
    $is_success = FALSE;

    $spider = new spider();
    $spider->url($url);
    $title = $spider->title;
    $fulltxt = $spider->fulltxt(800);
    $pagesize = $spider->pagesize;
    $keywords = $spider->keywords;
    $htmlcode = $spider->htmlcode;
    $description = $spider->description;

    // Single lookup of the owning site; reused below for the word filters
    // (the original ran the identical query a second time).
    $site_url = GetSiteUrl($url);
    $site = $db->get_one("select * from ve123_sites where url='" . $site_url . "'");
    // Unregistered site: keep site_id as NULL instead of indexing into an
    // empty result.
    $site_id = empty($site) ? NULL : $site["site_id"];

    echo $title;
    $array = array('title' => $title, 'fulltxt' => $fulltxt, 'pagesize' => $pagesize, 'keywords' => $keywords, 'description' => $description, 'site_id' => $site_id);
    $db->query("update ve123_links set updatetime='" . time() . "' where url='" . $url . "'");

    // A "?" in the title disqualifies the page (presumably a marker of a
    // garbled charset conversion - TODO confirm).
    if (!empty($title) && $pagesize > 1 && strpos($title, "?") === false) {
        // Accepted by default; the site's word lists can only veto.
        $is_shoulu = TRUE;

        if (!empty($site)) {
            if (!empty($site["include_word"])) {
                // At least one include word must be present. The original
                // never set the flag back to TRUE on a match, so sites
                // with an include list could never be indexed.
                $include_num = 0;
                foreach (explode(",", $site["include_word"]) as $value) {
                    if ($value !== '' && stripos((string) $htmlcode, $value) !== false) {
                        $include_num += 1;
                    }
                }
                $is_shoulu = $include_num > 0;
            }

            if (!empty($site["not_include_word"])) {
                // Any excluded word vetoes the page.
                $not_include_num = 0;
                foreach (explode(",", $site["not_include_word"]) as $value) {
                    if ($value !== '' && stripos((string) $htmlcode, $value) !== false) {
                        $not_include_num += 1;
                    }
                }
                if ($not_include_num > 0) {
                    $is_shoulu = FALSE;
                }
            }
        }

        if ($is_shoulu) {
            $db->update("ve123_links", $array, "url='" . $url . "'");
            //file_put_contents(PATH."k/www/".str_replace("http://","",$url.".html"),$htmlcode);
            $is_success = TRUE;
        }
    }

    // Existing contract kept as-is: stop the whole script when $bug_url
    // is empty.
    if (empty($bug_url)) {
        exit;
    }
    return $is_success;
}