Beispiel #1
0
<?php

ini_set("memory_limit", "1024M");
require __DIR__ . '/../core/init.php';
// NOTE(review): the two marker comments below explicitly ask not to be removed;
// the reason is not visible from this script, so they are kept verbatim.
/* Do NOT delete this comment */
/* 不要删除这段注释 */

// Fields pulled out of every content page; both are marked as required.
$fields = [
    [
        'name' => "content",
        'selector' => "//div[contains(@class,'article-content')]",
        'required' => true,
    ],
    [
        'name' => "date",
        'selector' => "//li[contains(@class,'active')]",
        'required' => true,
    ],
];

// Crawl configuration for the luofans.com audio listing. The exact meaning of
// each key is defined by the phpspider framework, not by this script.
$configs = [
    'name' => '罗辑思维',
    'tasknum' => 16,
    'domains' => [
        'luofans.com',
        'www.luofans.com',
    ],
    // Entry point of the crawl.
    'scan_urls' => [
        'http://www.luofans.com/audios',
    ],
    // Paginated list pages (the pattern matches the HTML-escaped "&amp;" form).
    'list_url_regexes' => [
        "http://www.luofans.com/audios\\?offset=\\d+&amp;max=10&amp;sort=publishAt&amp;order=desc",
    ],
    // Individual audio detail pages.
    'content_url_regexes' => [
        "http://www.luofans.com/audios/\\d+",
    ],
    'max_try' => 5,
    // Extracted rows are exported to the database table below.
    'export' => [
        'type' => 'db',
        'table' => 'luojisiwei_content',
    ],
    'fields' => $fields,
];

$spider = new phpspider($configs);

// Field post-processing hook: the "content" field is reduced to plain text,
// every other field is passed through unchanged.
$spider->on_extract_field = function ($fieldname, $data, $page) {
    return $fieldname == 'content' ? strip_tags($data) : $data;
};

$spider->start();
Beispiel #2
0
 /**
  * Extract the configured fields from a page, log the result and export it.
  *
  * Steps:
  *  1. Run get_fields() with the 'fields' configuration; stop if nothing was extracted.
  *  2. If an on_extract_page callback is set, let it replace the field set. A missing
  *     (null) or non-array return value is rejected with a warning and the originally
  *     extracted fields are kept.
  *  3. Bump the result counter and log the fields as JSON.
  *  4. If an 'export' configuration exists, append the row to a CSV/SQL file or
  *     insert it into the database, depending on export 'type'.
  *
  * @param mixed $html Page HTML, passed through to get_fields().
  * @param mixed $url  Page URL, passed through to get_fields().
  * @param mixed $page Page descriptor, passed to get_fields() and to the on_extract_page callback.
  * @return void
  * @author seatle <*****@*****.**> 
  * @created time :2016-09-18 10:17
  */
 public function get_html_fields($html, $url, $page)
 {
     $fields = $this->get_fields(self::$configs['fields'], $html, $url, $page);
     if (empty($fields)) {
         return;
     }

     if ($this->on_extract_page) {
         $return_data = call_user_func($this->on_extract_page, $page, $fields);
         if (!isset($return_data)) {
             log::warn("on_extract_page function return value can't be empty\n");
         } elseif (!is_array($return_data)) {
             log::warn("on_extract_page function return value must be an array\n");
         } else {
             $fields = $return_data;
         }
     }

     // Only array-shaped results are logged and exported.
     if (!is_array($fields)) {
         return;
     }

     $fields_num = $this->incr_fields_num();
     $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE);
     if ($fields_str === false) {
         // json_encode() fails on e.g. malformed UTF-8, which scraped pages often
         // contain. Report it instead of logging an empty result line; the export
         // below does not depend on the JSON string and still runs.
         log::warn(date("H:i:s") . " Result[{$fields_num}]: json_encode failed, error code " . json_last_error() . "\n");
     } else {
         // A configurable 'show_encoding' conversion was sketched here earlier but is
         // disabled; only the fixed Windows conversion is applied.
         if (util::is_win()) {
             $fields_str = mb_convert_encoding($fields_str, 'gb2312', 'utf-8');
         }
         log::info(date("H:i:s") . " Result[{$fields_num}]: " . $fields_str . "\n");
     }

     // Nothing more to do unless an export option is configured.
     if (empty(self::$configs['export'])) {
         return;
     }

     self::$export_type = isset(self::$configs['export']['type']) ? self::$configs['export']['type'] : '';
     switch (self::$export_type) {
         case 'csv':
             util::put_file(self::$export_file, util::format_csv($fields) . "\n", FILE_APPEND);
             break;
         case 'sql':
             // The returned value is written to the export file as a statement;
             // presumably the third argument makes db::insert() return the SQL
             // instead of executing it - TODO confirm against db::insert().
             $sql = db::insert(self::$export_table, $fields, true);
             util::put_file(self::$export_file, $sql . ";\n", FILE_APPEND);
             break;
         case 'db':
             db::insert(self::$export_table, $fields);
             break;
     }
 }
Beispiel #3
0
<?php

ini_set("memory_limit", "1024M");
require dirname(__FILE__) . '/../core/init.php';
// NOTE(review): the two marker comments below explicitly ask not to be removed;
// the reason is not visible from this script, so they are kept verbatim.
/* Do NOT delete this comment */
/* 不要删除这段注释 */
$spider = new phpspider();

// Attachment hook: receives the file URL and its type. JPG files are saved
// into the data directory; every other file type is ignored.
$spider->on_attachment_file = function ($url, $filetype, $phpspider) {
    if ($filetype == 'jpg') {
        // uniqid() builds the name from the current time in microseconds; it is
        // a convenient unique-ish file name, not a random value.
        $filename = uniqid();
        // Target file inside the data directory.
        $filepath = PATH_DATA . "/{$filename}.jpg";
        // Download with the system's wget. Both arguments are shell-escaped:
        // the URL comes from crawled content, so an unquoted "&", ";" or space
        // would otherwise break the command or allow command injection.
        exec('wget ' . escapeshellarg($url) . ' -O ' . escapeshellarg($filepath));
        // Fetching with file_get_contents()/file_put_contents() is an
        // alternative, but it holds the whole file in memory - use with care.
    }
};

// Two sample targets: a direct image URL and an article page. The second
// assignment overrides the first, so only the article page is requested;
// swap the lines to try the direct image download instead.
$url = "http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg";
$url = "http://www.epooll.com/archives/806/";
$spider->request_url($url);