<?php
/**
 * Spider script for the "罗辑思维" audio pages on luofans.com.
 *
 * Builds the phpspider configuration (scan/list/content URL patterns,
 * extracted fields, export target), registers a field hook that strips
 * HTML tags from the "content" field, and starts the crawl.
 */
ini_set("memory_limit", "1024M");
require dirname(__FILE__) . '/../core/init.php';

/* Do NOT delete this comment */
/* 不要删除这段注释 */

// Fields pulled from every content page; both are marked as required.
$field_rules = array(
    array(
        'name'     => "content",
        'selector' => "//div[contains(@class,'article-content')]",
        'required' => true,
    ),
    array(
        'name'     => "date",
        'selector' => "//li[contains(@class,'active')]",
        'required' => true,
    ),
);

// Results are exported with type "db" into the luojisiwei_content table.
$export_target = array(
    'type'  => 'db',
    'table' => 'luojisiwei_content',
);

$configs = array(
    'name'    => '罗辑思维',
    'tasknum' => 16,
    'domains' => array(
        'luofans.com',
        'www.luofans.com',
    ),
    'scan_urls' => array(
        'http://www.luofans.com/audios',
    ),
    'list_url_regexes' => array(
        "http://www.luofans.com/audios\\?offset=\\d+&max=10&sort=publishAt&order=desc",
    ),
    'content_url_regexes' => array(
        "http://www.luofans.com/audios/\\d+",
    ),
    'max_try' => 5,
    'export'  => $export_target,
    'fields'  => $field_rules,
);

$spider = new phpspider($configs);

// Field hook: every field is passed through unchanged except "content",
// which has its HTML tags removed.
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname != 'content') {
        return $data;
    }

    return strip_tags($data);
};

$spider->start();
/**
 * Extract the configured fields from an HTML page, log them and export them.
 *
 * The fields described by self::$configs['fields'] are extracted through
 * get_fields(). When an on_extract_page callback is set it is called with the
 * page and the extracted fields; an array return value replaces the fields,
 * any other return value is only reported with a warning. The resulting
 * fields are counted, logged as JSON and, when self::$configs['export'] is
 * not empty, written out according to the export type (csv, sql or db).
 *
 * @param mixed $html passed through to get_fields()
 * @param mixed $url  passed through to get_fields()
 * @param mixed $page passed through to get_fields() and to the on_extract_page callback
 * @return void
 * @author seatle <*****@*****.**>
 * @created time :2016-09-18 10:17
 */
public function get_html_fields($html, $url, $page)
{
    $extracted = $this->get_fields(self::$configs['fields'], $html, $url, $page);
    if (empty($extracted)) {
        return;
    }

    // Let the user callback post-process the extracted fields.
    if ($this->on_extract_page) {
        $callback_result = call_user_func($this->on_extract_page, $page, $extracted);
        if (is_array($callback_result)) {
            $extracted = $callback_result;
        } elseif (isset($callback_result)) {
            log::warn("on_extract_page function return value must be an array\n");
        } else {
            log::warn("on_extract_page function return value can't be empty\n");
        }
    }

    if (!is_array($extracted)) {
        return;
    }

    $result_no = $this->incr_fields_num();
    $encoded   = json_encode($extracted, JSON_UNESCAPED_UNICODE);

    // On Windows the logged text is converted from utf-8 to gb2312.
    if (util::is_win()) {
        $encoded = mb_convert_encoding($encoded, 'gb2312', 'utf-8');
    }
    log::info(date("H:i:s") . " Result[{$result_no}]: " . $encoded . "\n");

    // Nothing more to do unless an export option is configured.
    if (empty(self::$configs['export'])) {
        return;
    }

    self::$export_type = isset(self::$configs['export']['type'])
        ? self::$configs['export']['type']
        : '';

    switch (self::$export_type) {
        case 'csv':
            // Append one CSV line to the export file.
            util::put_file(self::$export_file, util::format_csv($extracted) . "\n", FILE_APPEND);
            break;

        case 'sql':
            // Append the INSERT statement returned by db::insert() to the export file.
            $statement = db::insert(self::$export_table, $extracted, true);
            util::put_file(self::$export_file, $statement . ";\n", FILE_APPEND);
            break;

        case 'db':
            // Insert into the export table through db::insert().
            db::insert(self::$export_table, $extracted);
            break;
    }
}
<?php
/**
 * Demo of the phpspider on_attachment_file hook.
 *
 * Registers a callback that downloads every attachment reported with file
 * type "jpg" into the data directory using the system wget binary, then
 * requests a single page.
 */
ini_set("memory_limit", "1024M");
require dirname(__FILE__) . '/../core/init.php';

/* Do NOT delete this comment */
/* 不要删除这段注释 */

$spider = new phpspider();

/**
 * Attachment hook: save jpg attachments to PATH_DATA via wget.
 *
 * @param string $url       attachment URL (comes from crawled content, so it is untrusted)
 * @param string $filetype  file type reported for the attachment
 * @param mixed  $phpspider third hook argument, unused here (presumably the spider instance)
 * @return void
 */
$spider->on_attachment_file = function ($url, $filetype, $phpspider) {
    if ($filetype == 'jpg') {
        // uniqid() derives a unique name from the current time in microseconds.
        $filename = uniqid();

        // Target image path inside the data directory.
        $filepath = PATH_DATA . "/{$filename}.jpg";

        // Download with the system wget binary. Both arguments are quoted with
        // escapeshellarg() so that shell metacharacters in a crawled URL cannot
        // inject additional commands.
        // NOTE(review): on Windows escapeshellarg() replaces '%' and '"' with
        // spaces, which would alter percent-encoded URLs — confirm if Windows
        // support matters here.
        exec('wget ' . escapeshellarg($url) . ' -O ' . escapeshellarg($filepath));
    }
};

// Page requested by this demo. To exercise the hook with a direct image URL,
// use e.g. "http://ocnt0imhl.bkt.clouddn.com/imgs/1637/2015-07/k306n1wzvkq669nm.jpg".
$url = "http://www.epooll.com/archives/806/";
$spider->request_url($url);