示例#1
0
<?php

// Raise PHP's memory_limit setting to 1024M for this run.
ini_set('memory_limit', '1024M');

// Pull in core/init.php, located one directory above this file's directory.
require __DIR__ . '/../core/init.php';
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// Crawler configuration handed to the phpspider constructor.
// The meaning of each key is defined by phpspider itself; consult its
// configuration documentation before changing values.
$configs = array(
    'name'     => '糗事百科',
    'log_show' => true,
    'tasknum'  => 5,

    'domains' => array(
        'qiushibaike.com',
        'www.qiushibaike.com',
    ),
    'scan_urls' => array(
        'http://www.qiushibaike.com/',
    ),

    // NOTE(review): the dots in these patterns are not escaped, so each "."
    // matches any character — presumably harmless here, confirm if tightening.
    'list_url_regexes' => array(
        "http://www.qiushibaike.com/8hr/page/\\d+\\?s=\\d+",
    ),
    'content_url_regexes' => array(
        "http://www.qiushibaike.com/article/\\d+",
    ),

    'max_try' => 5,

    'export' => array(
        'type'  => 'db',
        'table' => 'content',
    ),

    // One entry per extracted field; every field is flagged as required.
    'fields' => array(
        array(
            'name'     => "article_title",
            'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]",
            'required' => true,
        ),
        array(
            'name'     => "article_author",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        array(
            'name'     => "article_headimg",
            'selector' => "//div[contains(@class,'author')]//a[1]",
            'required' => true,
        ),
        array(
            'name'     => "article_content",
            'selector' => "//*[@id='single-next-link']//div[contains(@class,'content')]",
            'required' => true,
        ),
        // NOTE(review): the next two fields reuse the author <h2> selector;
        // they look like placeholders — confirm whether that is intended.
        array(
            'name'     => "article_publish_time",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
        array(
            'name'     => "url",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ),
    ),
);
// Instantiate the crawler with the configuration array built above.
$spider = new phpspider($configs);

// Callback stored in the spider's on_start property. It receives one argument
// (unused here) and registers a cookie through the requests helper.
// Presumably phpspider invokes it when the crawl begins — confirm in the library.
$spider->on_start = function ($crawler) {
    // Disabled Referer header, kept for reference:
    //requests::add_header("Referer", "http://buluo.qq.com/p/index.html");
    requests::add_cookie('name', 'yangzetao');
};
$spider->on_handle_img = function ($fieldname, $img) {
    $regex = '/src="(https?:\\/\\/.*?)"/i';
    preg_match($regex, $img, $rs);
    if (!$rs) {
        return $img;
    }
    $url = $rs[1];
    $img = $url;
    //$pathinfo = pathinfo($url);
    //$fileext = $pathinfo['extension'];
    //if (strtolower($fileext) == 'jpeg')
    //{
    //$fileext = 'jpg';
    //}
    //// 以纳秒为单位生成随机数
    //$filename = uniqid().".".$fileext;
    //// 在data目录下生成图片
    //$filepath = PATH_ROOT."/images/{$filename}";
    //// 用系统自带的下载器wget下载