Esempio n. 1
0
/**
 * Shorten a string, marking the cut with '...'.
 *
 * The length is measured with strlen_utf8() and the cut is made with
 * substr_utf8(); a string that does not exceed $number is returned as-is.
 *
 * @param string $str    Text to shorten.
 * @param int    $number Maximum length kept in front of the ellipsis.
 * @return string The untouched input, or its leading part followed by '...'.
 */
function sub_str($str, $number)
{
    return strlen_utf8($str) > $number
        ? substr_utf8($str, 0, $number) . '...'
        : $str;
}
 /**
  * Build the description text. Note: the title must be generated first!
  *
  * Combines the first five entries of $this->arrContent, $this->key and the
  * first key returned by JumbleArrayUseFeature(), shuffles the '|'-separated
  * pieces, joins them, and stores the result of substr_utf8(..., 0, 1000)
  * in $this->descriptions.
  */
 function GenerateDescription()
 {
     $keys = $this->arrKeys;
     $keys = $this->JumbleArrayUseFeature($keys);
     $extraKey = $keys[0];
     $joined = implode('|', array_slice($this->arrContent, 0, 5)) . '|' . $this->key . '|' . $extraKey;
     // explode() splits on every '|', including any inside the content entries.
     $pieces = explode('|', $joined);
     // Plain randomness is enough for the description; no pseudo-random order needed.
     shuffle($pieces);
     $text = replace_dbs(implode('', $pieces), '.');
     $this->descriptions = substr_utf8($text, 0, 1000);
 }
Esempio n. 3
0
/**
 * Smarty truncate modifier plugin
 *
 * Type:     modifier<br>
 * Name:     utf8_truncate<br>
 * Purpose:  remove HTML with StripHTML(), keep the first $length characters
 *           via substr_utf8(), and append $etc only when text was cut off.
 * @author  jack.z
 * @param string  $string      Text to truncate.
 * @param integer $length      Maximum number of characters to keep.
 * @param string  $etc         Suffix appended when the text was shortened.
 * @param boolean $break_words Kept for signature compatibility; not used here.
 * @param boolean $middle      Kept for signature compatibility; not used here.
 * @return string
 */
function smarty_modifier_utf8_truncate($string, $length = 80, $etc = '...', $break_words = false, $middle = false)
{
    $plain = (string) StripHTML($string);
    $cut = (string) substr_utf8($plain, 0, $length);
    // Previously $etc was appended unconditionally, so even short strings
    // ended in '...'. Only add the marker when something was really removed.
    if ($cut === $plain) {
        return $cut;
    }
    return $cut . $etc;
}
Esempio n. 4
0
/**
 * Render the article view page and echo it.
 *
 * Loads ./templets/view.html, builds the keyword and tag link lists, fills a
 * placeholder => value map and replaces every '{placeholder}' in the template.
 * After the page has been echoed, ./fun/robot.php is included.
 *
 * @param array  $arr      Reads 'keys', 'tag_key', 'flink', 'key', 'title' and 'adkey'.
 * @param array  $arr_view Reads 'title', 'description', 'body' and 'x_link'.
 * @param string $domain   Domain suffix appended after the sub-domain part of each link.
 * @param string $base_url Base URL of the site; also hashed into the fallback error code.
 * @return void
 */
function out_view($arr, $arr_view, $domain, $base_url)
{
    global $cfg_tk_pid;
    $con = file_get_contents("./templets/view.html");
    /* =================
     * Link generation
     * =================
     *
     * Related links
     * $key_link: set of keyword links built from the keyword list
     */
    $arr_keys = explode(",", $arr['keys']);
    // Related keyword links: generated on every request, not stored in the cache, but output.
    $key_link = '';
    foreach ($arr_keys as $kk) {
        // get_epath() yields "<prefix>/<path>"; the prefix is placed in front of $domain.
        $key_epath = get_epath($kk);
        $arr_temp = explode("/", $key_epath);
        $key_link .= '<li><a href="http://' . $arr_temp[0] . $domain . '/' . $arr_temp[1] . '">' . $kk . '</a></li>';
    }
    /*
     * $tag_link: tag links, made of a random part and an interlinking part
     */
    // Start from the links supplied in $arr['flink'] (provided by the database per the original note).
    $tag_link = $arr['flink'];
    $arr_randkey = explode(',', $arr['tag_key']);
    foreach ($arr_randkey as $kk) {
        $key_epath = get_epath($kk);
        $arr_temp = explode("/", $key_epath);
        $tag_link .= '<li><a href="http://' . $arr_temp[0] . $domain . '/' . $arr_temp[1] . '">' . $kk . '</a></li>';
    }
    // Appending 'x' turns a trailing '/' into '/x', which is then removed
    // (any other '/x' inside the URL is removed as well).
    $base = str_replace('/x', '', $base_url . 'x');
    /*
     * Replacement map
     */
    $arr_rep = array();
    $arr_rep['key'] = $arr['key'];
    $arr_rep['title'] = $arr_view['title'];
    $arr_rep['maintitle'] = $arr['title'];
    // Replace every HTML tag in the description with a comma, then keep 50 characters.
    $description = preg_replace("%<[^>]+>%", ',', $arr_view['description']);
    $description = substr_utf8($description, 0, 50);
    $arr_rep['miaoshu'] = $description;
    $arr_rep['base_url'] = $base;
    // The emptiness check must look at the raw article body. The original
    // tested the value after the "you may also like" suffix had been appended,
    // so the fallback message could never be shown.
    $body = $arr_view['body'];
    if ($body == '') {
        // The article content has not been generated yet.
        $body = '暂无数据,请等待。错误编码:' . md5($base_url);
    }
    $arr_rep['body'] = $body . "<br>" . "您可能感兴趣:<br>" . $arr_view['x_link'];
    $arr_rep['adkey'] = $arr['adkey'];
    $arr_rep['adkey_code'] = urlencode($arr['adkey']);
    $arr_rep['adkey_iconv'] = urlencode(iconv("UTF-8", "GBK//IGNORE", $arr['adkey']));
    $arr_rep['pid'] = $cfg_tk_pid;
    $arr_rep['keys_link'] = $key_link;
    $arr_rep['tag_link'] = $tag_link;
    /*
     * Substitution: placeholders are replaced in the insertion order above.
     */
    foreach ($arr_rep as $k => $v) {
        $con = str_replace('{' . $k . '}', $v, $con);
    }
    echo $con;
    include './fun/robot.php';
}
/**
 * Truncate text to a maximum number of characters, optionally HTML-aware.
 *
 * In HTML mode tags are copied to the output without being counted, HTML
 * entities count as one character each, and tags still open at the cut point
 * are closed after the ending. Lengths are measured with strlen_utf8().
 *
 * @param string $text         Text to truncate.
 * @param int    $length       Maximum length of the result, including $ending.
 * @param string $ending       Marker appended to truncated text.
 * @param bool   $exact        When false, the text is cut back to the last space.
 * @param bool   $considerHtml When true, HTML tags are handled as described above.
 * @return string The original text when it already fits, otherwise the truncated text.
 */
function truncate($text, $length = 200, $ending = '...', $exact = true, $considerHtml = true)
{
    if ($considerHtml) {
        // if the plain text is shorter than the maximum length, return the whole text
        if (strlen_utf8(preg_replace('/<.*?>/u', '', $text)) <= $length) {
            return $text;
        }
        // splits all html-tags to scanable lines
        preg_match_all('/(<.+?>)?([^<>]*)/su', $text, $lines, PREG_SET_ORDER);
        $total_length = strlen_utf8($ending);
        $open_tags = array();
        $truncate = '';
        foreach ($lines as $line_matchings) {
            // if there is any html-tag in this line, handle it and add it (uncounted) to the output
            if (!empty($line_matchings[1])) {
                if (preg_match('/^<(\\s*.+?\\/\\s*|\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\\s.+?)?)>$/isu', $line_matchings[1])) {
                    // "empty element" with or without xhtml-conform closing slash (f.e. <br/>): nothing to track
                } elseif (preg_match('/^<\\s*\\/([^\\s]+?)\\s*>$/us', $line_matchings[1], $tag_matchings)) {
                    // closing tag: delete it from the $open_tags list. Open tags are stored
                    // lowercased, so the closing name is lowercased too (</DIV> closes <div>).
                    $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                    if ($pos !== false) {
                        unset($open_tags[$pos]);
                    }
                } elseif (preg_match('/^<\\s*([^\\s>!]+).*?>$/su', $line_matchings[1], $tag_matchings)) {
                    // opening tag: add it to the beginning of the $open_tags list
                    array_unshift($open_tags, strtolower($tag_matchings[1]));
                }
                // add html-tag to $truncate'd text
                $truncate .= $line_matchings[1];
            }
            // calculate the length of the plain text part of the line; handle entities as one character
            $content_length = strlen_utf8(preg_replace('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/ui', ' ', $line_matchings[2]));
            if ($total_length + $content_length > $length) {
                // the number of characters which are left
                $left = $length - $total_length;
                $entities_length = 0;
                // search for html entities
                if (preg_match_all('/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/iu', $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                    // calculate the real length of all entities in the legal range
                    foreach ($entities[0] as $entity) {
                        if ($entity[1] + 1 - $entities_length <= $left) {
                            $left--;
                            $entities_length += strlen_utf8($entity[0]);
                        } else {
                            // no more characters left
                            break;
                        }
                    }
                }
                $truncate .= substr_utf8($line_matchings[2], 0, $left + $entities_length);
                // maximum length is reached, so get off the loop
                break;
            } else {
                $truncate .= $line_matchings[2];
                $total_length += $content_length;
            }
            // if the maximum length is reached, get off the loop
            if ($total_length >= $length) {
                break;
            }
        }
    } else {
        if (strlen_utf8($text) <= $length) {
            return $text;
        }
        $truncate = substr_utf8($text, 0, $length - strlen_utf8($ending));
    }
    // if the words shouldn't be cut in the middle...
    if (!$exact) {
        // ...search the last occurrence of a space...
        $spacepos = strrpos($truncate, ' ');
        // strrpos() returns false when there is no space. The former isset()
        // test was always true and then cut the text down to nothing.
        if ($spacepos !== false) {
            // ...and cut the text in this position. strrpos() reports a byte
            // offset, so the byte-based substr() is used; a space is a single
            // byte in UTF-8, so no multi-byte character can be split here.
            $truncate = substr($truncate, 0, $spacepos);
        }
    }
    // add the defined ending to the text
    $truncate .= $ending;
    if ($considerHtml) {
        // close all unclosed html-tags
        foreach ($open_tags as $tag) {
            $truncate .= '</' . $tag . '>';
        }
    }
    return $truncate;
}
Esempio n. 6
0
 /**
  * Crawl one article and merge its answers into $this->body.
  *
  * Takes the first id in $this->listNotCrawl, fetches the matching
  * zhidao.baidu.com question page with CurlGet(), extracts the title and the
  * answer blocks, mixes them into the existing body and shuffles the
  * '<br>'-separated fragments. A page without a title counts as an error;
  * once $this->errorTime exceeds 6 the id is dropped. When the body grows
  * beyond 20000 bytes it is cut and the remaining queue is cleared.
  *
  * Reads and writes: listNotCrawl, errorTime, isUp, body, title.
  *
  * @return void
  */
 function CrawlView()
 {
     if (count($this->listNotCrawl) < 1) {
         // Everything has been crawled; stop fetching.
         //$this->isUp=FALSE;
         return;
     }
     if ($this->errorTime > 6) {
         // More than 6 errors for this article: skip it.
         array_splice($this->listNotCrawl, 0, 1);
         $this->errorTime = 0;
         $this->isUp = TRUE;
         return;
     }
     $articleId = $this->listNotCrawl[0];
     $data = CurlGet('http://zhidao.baidu.com/question/' . $articleId . '.html');
     $data = iconv("GBK", "UTF-8", $data);
     // Title: the text of <title> up to the first underscore.
     $title = '';
     if (preg_match('%<title>([^>]+?)_%sim', $data, $arr)) {
         $title = strip_tags($arr[1]);
     }
     if ($title == '') {
         $this->isUp = TRUE;
         $this->errorTime += 1;
         return;
     }
     /* Question description (disabled)
        $question_description='';
        if(preg_match('%<pre id="question-content">([^<]+)</pre>%sim',$data,$arr)){
            $question_description=trim($arr[1]);
        }*/
     // Must be an array: the original started from '' and then used
     // $answers[] = ..., which is a fatal error on PHP >= 7.1, and passed a
     // string to array_merge() whenever no answer matched.
     $answers = array();
     // NOTE(review): the search strings below are single-quoted, so they match
     // a literal backslash followed by r/n rather than real line breaks —
     // confirm whether actual CR/LF characters were intended.
     // Best answer
     if (preg_match('%<pre id="best-answer-content[^>]+?>([^<]+?)</pre>%sim', $data, $arr)) {
         $answers[] = str_replace(array('\\r', '\\n', '\\r\\n'), '<br>', strip_tags(trim($arr[1])));
     }
     // Other answers
     if (preg_match_all('%<pre class="reply[^>]+?>([^<]+?)</pre>%sim', $data, $arr)) {
         foreach ($arr[1] as $r) {
             $answers[] = str_replace(array('\\r', '\\n', '\\r\\n'), '<br>', strip_tags(trim($r)));
         }
     }
     $data = null;
     $arr = null;
     unset($data, $arr);
     // Mix the new answers into the existing body fragments.
     $arrBody = explode('<br>', $this->body);
     $arrBody = array_merge($arrBody, $answers);
     if (count($arrBody) <= 1) {
         $body = $arrBody[0];
     } else {
         shuffle($arrBody);
         $body = implode('<br>', $arrBody);
     }
     // Remove the page title from the text, then shuffle the fragments again.
     $body = str_replace($title, '', $body);
     $arrBody = explode('<br>', $body);
     shuffle($arrBody);
     $body = implode('<br>', $arrBody);
     if ($this->title == '') {
         $this->title = $title;
     }
     $this->errorTime = 0;
     if (strlen($body) > 20000) {
         // The body reached 20K: cut it and stop crawling altogether.
         $body = substr_utf8($body, 0, 20000);
         $this->body = $body;
         $body = null;
         $this->listNotCrawl = array();
         $this->isUp = TRUE;
         return;
     }
     $this->body = $body;
     $body = null;
     // Drop the article that was just crawled successfully.
     array_splice($this->listNotCrawl, 0, 1);
     $this->isUp = TRUE;
 }
Esempio n. 7
0
/**
 * Build a title and a description for an article page from numbered text files.
 *
 * The MD5 of a URL assembled from $base_url and $a_no serves as fingerprint:
 * five 4-character slices (through toTen(), plus 1) select the source files
 * under $cfg_xs_url, and four 8-character slices (through toTen()) are
 * concatenated and split into single characters to form the sequence handed
 * to arr_choas(). The description is shuffled with shuffle() and randomly
 * includes the secondary keyword, so it differs between calls.
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated keyword list; the secondary keyword is picked from it.
 * @param mixed  $a_no     Article number placed in the fingerprint URL.
 * @param string $base_url URL prefix placed in the fingerprint URL.
 * @return array Array with the entries 'title' and 'des'.
 */
function randart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;
    $md5_url = md5('http://' . $base_url . 'read-' . $a_no . 'html');

    // Fingerprint sequences: $art_sp selects the text files, $choas_sp drives the reordering.
    $art_sp = array();
    $slot = 0;
    while ($slot < 5) {
        $art_sp[$slot] = toTen(substr($md5_url, $slot * 4, 4)) + 1;
        $slot++;
    }
    $digitString = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digitString .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digitString, 1);

    // Secondary keyword: first entry of the reordered keyword list.
    $secondary = explode(",", $keys);
    $secondary = arr_choas($secondary, $choas_sp);
    $key2 = $secondary[0];

    // Title: combine the five texts with the secondary keyword and reorder once,
    // then keep 20 characters, add the main keyword and reorder again.
    $title = '';
    $des = '';
    foreach ($art_sp as $fileNo) {
        $chunk = file_get_contents($cfg_xs_url . $fileNo . ".txt");
        $title .= $chunk . '|' . $key2;
        $des .= $chunk;
    }
    $titleParts = explode("|", $title);
    $titleParts = arr_choas($titleParts, $choas_sp);
    $title = implode('|', $titleParts);
    // Tags become commas first, to keep letters inside URLs from being matched.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', ' ', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);
    $titleParts = explode("|", $title . '|' . $key);
    $titleParts = arr_choas($titleParts, $choas_sp);
    $title = implode('', $titleParts);

    // Description: the secondary keyword is added only when rand(0, 10) exceeds 7.
    $withSecondary = rand(0, 10) > 7;
    $desSource = $withSecondary ? $des . '|' . $key2 . '|' . $key : $des . '|' . $key;
    $arr_des = explode('|', $desSource);
    shuffle($arr_des);
    $des = replace_dbs(implode('', $arr_des), '.');
    $des = substr_utf8($des, 0, 500);

    return array('title' => $title, 'des' => $des);
}
Esempio n. 8
0
/**
 * Build a title and a body by inserting keywords into numbered text files.
 *
 * The MD5 of a URL assembled from $base_url and $a_no serves as fingerprint:
 * 28 overlapping 4-character slices (through toTen(), plus 1) select the text
 * files under $cfg_xs_url, and four 8-character slices (through toTen()) are
 * concatenated and split into single characters to form the sequence handed
 * to arr_choas() and arr_insertkey().
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated keyword list.
 * @param mixed  $a_no     Article number placed in the fingerprint URL.
 * @param string $base_url URL prefix placed in the fingerprint URL.
 * @return array Array with the entries 'title' and 'body'.
 */
function insertart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;
    $md5_url = md5('http://' . $base_url . 'reads-' . $a_no . 'html');

    // $body_sp: 28 file numbers; the title uses the first one.
    $body_sp = array();
    for ($col = 0; $col < 7; $col++) {
        for ($shift = 0; $shift < 4; $shift++) {
            $body_sp[$col + 7 * $shift] = toTen(substr($md5_url, $col * 4 + $shift, 4)) + 1;
        }
    }
    // $choas_sp: single characters of the four concatenated toTen() results.
    $digitString = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digitString .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digitString, 1);

    // Secondary keyword: first entry of the reordered keyword list.
    $secondary = explode(",", $keys);
    $secondary = arr_choas($secondary, $choas_sp);
    $key2 = $secondary[0];

    // Title: take the first text and insert the keywords according to the fingerprint.
    $firstText = file_get_contents($cfg_xs_url . $body_sp[0] . ".txt");
    $body = $firstText;
    $arr_title = explode("|", $firstText);
    $arr_key = array($key, $key2);
    $title = arr_insertkey($arr_title, $choas_sp, $arr_key, 1);
    // Tags become commas first, to keep letters inside URLs from being matched.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', ' ', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);

    // Body: append all 28 texts (the first one a second time), with a '<br>'
    // after every fifth file, then insert the keywords.
    for ($n = 0; $n < 28; $n++) {
        $body .= '|' . file_get_contents($cfg_xs_url . $body_sp[$n] . ".txt");
        if ($n % 5 == 4) {
            $body .= '<br>';
        }
    }
    $arr_body = explode('|', $body);
    $arr_key = explode(",", implode(',', array($key, $keys, $key, $key, $key)));
    $body = arr_insertkey($arr_body, $choas_sp, $arr_key, 10);
    $body = replace_dbs($body, '.');
    $body = str_replace(' ', '', $body);
    //$body=substr_utf8($body,0,500);

    return array('title' => $title, 'body' => $body);
}
Esempio n. 9
0
<?php

// Declare the response as UTF-8 HTML so the multi-byte demo output below is rendered correctly.
header('Content-type:text/html;charset=utf-8');
/**
 * Demo helper: split a UTF-8 string into its characters and dump the array.
 *
 * $start now defaults to 0 so the helper can be called with the string alone;
 * it used to be a required parameter, which made a one-argument call fail
 * with an ArgumentCountError on PHP >= 7.1.
 *
 * @param string   $str    UTF-8 text to split.
 * @param int      $start  Accepted for the planned substring signature; not used yet.
 * @param int|null $length Accepted for the planned substring signature; not used yet.
 * @return void The character array is printed with var_dump().
 */
function substr_utf8($str, $start = 0, $length = null)
{
    // An empty pattern with the /u modifier splits between every UTF-8 character.
    $arr = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);
    var_dump($arr);
}
$str = '传智播客PHP学院';
// Pass the start offset explicitly: substr_utf8() declares $start without a
// default, so calling it with the string alone is an error on PHP >= 7.1.
substr_utf8($str, 0);
Esempio n. 10
0
/**
 * Build a title and a body for an article page from numbered text files.
 *
 * The MD5 of a URL assembled from $base_url and $a_no serves as fingerprint:
 * 28 overlapping 4-character slices (through toTen(), plus 1) select the text
 * files under $cfg_xs_url; the first five of them feed the title. Four
 * 8-character slices (through toTen()) are concatenated and split into single
 * characters to form the sequence handed to arr_choas().
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated keyword list; the secondary keyword is picked from it.
 * @param mixed  $a_no     Article number placed in the fingerprint URL.
 * @param string $base_url URL prefix placed in the fingerprint URL.
 * @return array Array with the entries 'title' and 'body'.
 */
function readart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;
    $md5_url = md5('http://' . $base_url . 'read-' . $a_no . 'html');

    // $body_sp: 28 file numbers for the text; $art_sp: the first five, used for the title.
    $body_sp = array();
    $art_sp = array();
    for ($col = 0; $col < 7; $col++) {
        for ($shift = 0; $shift < 4; $shift++) {
            $body_sp[$col + 7 * $shift] = toTen(substr($md5_url, $col * 4 + $shift, 4)) + 1;
        }
        if ($col < 5) {
            $art_sp[$col] = $body_sp[$col];
        }
    }
    // $choas_sp: single characters of the four concatenated toTen() results.
    $digitString = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digitString .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digitString, 1);

    // Secondary keyword: first entry of the reordered keyword list.
    $secondary = explode(",", $keys);
    $secondary = arr_choas($secondary, $choas_sp);
    $key2 = $secondary[0];

    // Title: combine the five texts with the secondary keyword and reorder once,
    // then keep 20 characters, add the main keyword and reorder again.
    $title = '';
    foreach ($art_sp as $fileNo) {
        $title .= file_get_contents($cfg_xs_url . $fileNo . ".txt") . '|' . $key2;
    }
    // The body starts from the raw title material, before any reordering.
    $body = $title;
    $titleParts = explode("|", $title);
    $titleParts = arr_choas($titleParts, $choas_sp);
    $title = implode('|', $titleParts);
    // Tags become commas first, to keep letters inside URLs from being matched.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', ' ', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);
    $titleParts = explode("|", $title . '|' . $key);
    $titleParts = arr_choas($titleParts, $choas_sp);
    $title = implode('', $titleParts);

    // Body: append files 5..27; the first eleven of them are followed by the
    // main keyword and a sentence break.
    for ($n = 5; $n < 28; $n++) {
        $body .= '|' . file_get_contents($cfg_xs_url . $body_sp[$n] . ".txt");
        if ($n < 16) {
            $body .= '|' . $key . '|' . '。<br>';
        }
    }
    $arr_body = explode("|", $body);
    $arr_body = arr_choas($arr_body, $choas_sp);
    $body = replace_dbs(implode('', $arr_body) . '。', '。');

    return array('title' => $title, 'body' => $body);
}
Esempio n. 11
0
{
    return join("", array_slice(preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY), $start, $lenth));
}
// Demo: call the custom substr_utf8() helper with start 2 and length 1 and print the result.
$str = '贾智超敬爱之巢';
echo substr_utf8($str, 2, 1);
echo "<br/>-----------------------------------------------------------------------------<br/>";
$str1 = '一二三四五六七吧九十';
// Name the encoding explicitly, like the other mb_* calls in this script,
// instead of depending on the configured internal encoding.
echo mb_substr($str1, 2, 3, 'utf8');
echo "<br/>-----------------------------------------------------------------------------<br/>";
/**
 * Count the characters of a string and pick one of them with the mbstring functions.
 */
$a = '中国2北333京';
// Prints the string length, a line break, then the single character at offset 3.
echo mb_strlen($a, 'utf8'), "<br/>", mb_substr($a, 3, 1, 'utf8');
echo "<br/>";
/**
 * Custom implementation that returns the length of a UTF-8 string in characters.
 *
 * The string is split between characters with preg_split() and the /u
 * modifier. When the input is not valid UTF-8, preg_split() returns false;
 * the byte length is returned in that case, because passing false to count()
 * throws a TypeError on PHP 8.
 *
 * @param string $str The string to measure.
 * @return int Number of characters, or number of bytes for input that is not valid UTF-8.
 */
function strlen_utf8($str)
{
    $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        return strlen($str);
    }
    return count($chars);
}
// Demo: call the custom substr_utf8() helper with start 3 and length 1 on a mixed CJK/ASCII string.
$a = '中国2北343433京';
echo substr_utf8($a, 3, 1);
Esempio n. 12
0
// Build the tag links
$tagLink = '';
$arrTagKey = explode(',', $indexCache->tagKey);
foreach ($arrTagKey as $v) {
    // GetEPath() yields "<prefix>/<path>"; the prefix is placed in front of the main domain.
    $tagEPath = GetEPath($v);
    $arr = explode('/', $tagEPath);
    // Close the anchor before the list item: the original emitted
    // '<li><a ...>text</li>' and left every <a> element unclosed.
    $tagLink .= '<li><a href="http://' . $arr[0] . $indexCache->mainDomain . '/' . $arr[1] . '">' . $v . '</a></li>';
}
// Replacement map: placeholder name => value
$arrReplace = array();
$arrReplace['key'] = $indexCache->key;
$arrReplace['keys'] = $indexCache->keys;
$arrReplace['title'] = $baidu->title;
$arrReplace['maintitle'] = $indexCache->title;
$arrReplace['adkey'] = $indexCache->adKey;
$arrReplace['keyslink'] = $keysLink;
$arrReplace['taglink'] = $tagLink;
if ($baidu->body == '') {
    // No article content yet: show a placeholder message with an error code.
    $arrReplace['body'] = '暂无数据,请等待。错误编码:' . md5($indexCache->baseUrl . $viewID);
} else {
    $arrReplace['body'] = $baidu->body;
}
$arrReplace['body'] .= '您可能感兴趣:<br>' . $baidu->links;
$arrReplace['baseurl'] = $indexCache->baseUrl;
// Replace every HTML tag in the description with a comma, then keep 50 characters.
$description = preg_replace("%<[^>]+>%", ',', $baidu->description);
$description = substr_utf8($description, 0, 50);
$arrReplace['description'] = $description;
// Substitute every '{placeholder}' in the template, in the insertion order above.
foreach ($arrReplace as $k => $v) {
    $templets = str_replace('{' . $k . '}', $v, $templets);
}
echo $templets;
Esempio n. 13
0
/**
 * Reduce a forum message to plain text and optionally shorten it.
 *
 * The message is passed through messagesafeclear(), cut at the first run of
 * three NUL bytes, stripped of BBCode and HTML tags, shortened with
 * substr_utf8() when $length is non-zero, and finally trimmed.
 *
 * @param string $str    Raw message text.
 * @param int    $length Maximum length; 0 disables shortening.
 * @param string $dot    Not used by the current implementation (substr_utf8() adds no suffix).
 * @return string The cleaned and trimmed text.
 */
function messagecutstr($str, $length = 0, $dot = ' ...')
{
    $str = messagesafeclear($str);

    // Discard everything from the first triple-NUL marker onwards.
    $marker = chr(0) . chr(0) . chr(0);
    $cutAt = strpos($str, $marker);
    if (false !== $cutAt) {
        $str = substr($str, 0, $cutAt);
    }

    // Tags whose markers are removed while their content is kept...
    $bbcodes = 'b|i|u|p|color|size|font|align|list|indent|float';
    // ...and tags that are removed together with their content.
    $bbcodesclear = 'email|code|free|table|tr|td|img|swf|flash|attach|media|audio|groupid|payto';

    // [i]...[/i] blocks are dropped entirely before the general clean-up.
    $str = preg_replace("/\\[i=?.*?\\](.*?)\\[\\/i\\]/is", '', $str);

    $patterns = array(
        "/\\[hide=?\\d*\\](.*?)\\[\\/hide\\]/is",
        "/\\[quote](.*?)\\[\\/quote]/si",
        "/\\[url=?.*?\\](.+?)\\[\\/url\\]/si",
        "/\\[({$bbcodesclear})=?.*?\\](.*?)\\[\\/\\1\\]/si",
        "/\\[({$bbcodes})=?.*?\\]/i",
        "/\\[\\/({$bbcodes})\\]/i",
    );
    // Only [url] keeps its inner text; every other match is removed.
    $replacements = array('', '', '\\1', '', '', '');
    $str = strip_tags(preg_replace($patterns, $replacements, $str));

    if ($length) {
        $str = substr_utf8($str, 0, $length);
    }
    return trim($str);
}