/**
 * Shorten a string, marking the cut with '...'.
 *
 * When strlen_utf8() reports more than $number characters, the first
 * $number characters (taken with substr_utf8()) are returned followed by
 * '...'. Otherwise the string is returned unchanged.
 *
 * @param string $str    Text to shorten.
 * @param int    $number Maximum number of characters to keep.
 * @return string The original text, or its shortened form plus '...'.
 */
function sub_str($str, $number)
{
    $isTooLong = strlen_utf8($str) > $number;

    return $isTooLong ? substr_utf8($str, 0, $number) . '...' : $str;
}
/**
 * Build the description. Note: the title must be generated first!
 *
 * Combines the first five entries of $this->arrContent, $this->key and the
 * first key returned by JumbleArrayUseFeature(), shuffles the '|'-separated
 * fragments, passes the result through replace_dbs() and stores at most
 * 1000 characters in $this->descriptions.
 */
function GenerateDescription()
{
    // Work on a local copy of the key list before jumbling it.
    $keys = $this->arrKeys;
    $keys = $this->JumbleArrayUseFeature($keys);
    $extraKey = $keys[0];

    $joined = implode('|', array_slice($this->arrContent, 0, 5)) . '|' . $this->key . '|' . $extraKey;

    // Splitting the joined text again also breaks up any content entry that
    // itself contains '|'.
    $fragments = explode('|', $joined);

    // Plain random order is good enough for the description; no need for the
    // pseudo-random ordering here.
    shuffle($fragments);

    $text = implode('', $fragments);
    $text = replace_dbs($text, '.');

    $this->descriptions = substr_utf8($text, 0, 1000);
}
/**
 * Smarty truncate modifier plugin
 *
 * Type:     modifier<br>
 * Name:     utf8_truncate<br>
 * Purpose:  Strip HTML from a string (via StripHTML()) and cut it to at most
 *           $length characters (via substr_utf8()). $etc is appended only
 *           when something was actually cut off, as with Smarty's stock
 *           truncate modifier.
 *
 * @author jack.z
 * @param string  $string      Text to truncate.
 * @param integer $length      Maximum number of characters to keep.
 * @param string  $etc         Marker appended when the text was shortened.
 * @param boolean $break_words Accepted for signature compatibility; not used.
 * @param boolean $middle      Accepted for signature compatibility; not used.
 * @return string The HTML-stripped text, shortened and marked if necessary.
 */
function smarty_modifier_utf8_truncate($string, $length = 80, $etc = '...', $break_words = false, $middle = false)
{
    $plain = StripHTML($string);
    $cut = substr_utf8($plain, 0, $length);

    // Nothing was removed, so there is nothing to flag with $etc.
    if ($cut === $plain) {
        return $plain;
    }

    return $cut . $etc;
}
/**
 * Render the article view page and echo it.
 *
 * Loads ./templets/view.html, builds the keyword and tag link lists, fills
 * every {placeholder} in the template from the replacement table, echoes the
 * result and finally includes ./fun/robot.php.
 *
 * @param array  $arr      Page record; reads 'keys', 'flink', 'tag_key',
 *                         'key', 'title' and 'adkey'.
 * @param array  $arr_view Article record; reads 'title', 'description',
 *                         'body' and 'x_link'.
 * @param string $domain   Domain suffix appended after each link's first
 *                         path segment.
 * @param string $base_url Base URL of the page.
 * @return void
 */
function out_view($arr, $arr_view, $domain, $base_url)
{
    global $cfg_tk_pid;

    $con = file_get_contents("./templets/view.html");

    /*
     * Link generation
     *
     * $key_link: links built from the keyword list. Generated on the fly,
     * not stored in the cache, but part of the output.
     */
    $key_link = '';
    $arr_keys = explode(",", $arr['keys']);
    foreach ($arr_keys as $kk) {
        $key_epath = get_epath($kk);
        $arr_temp = explode("/", $key_epath);
        $key_link .= '<li><a href="http://' . $arr_temp[0] . $domain . '/' . $arr_temp[1] . '">' . $kk . '</a></li>';
    }

    /*
     * $tag_link: tag links. Starts from the interlink part supplied in
     * $arr['flink'] and appends one link per entry of $arr['tag_key'].
     */
    $tag_link = $arr['flink'];
    $arr_randkey = explode(',', $arr['tag_key']);
    foreach ($arr_randkey as $kk) {
        $key_epath = get_epath($kk);
        $arr_temp = explode("/", $key_epath);
        $tag_link .= '<li><a href="http://' . $arr_temp[0] . $domain . '/' . $arr_temp[1] . '">' . $kk . '</a></li>';
    }

    // Appending 'x' and removing '/x' drops a trailing slash.
    // NOTE(review): this also removes any other '/x' inside the URL.
    $base = str_replace('/x', '', $base_url . 'x');

    // HTML tags in the description become commas; only 50 characters are kept.
    $description = preg_replace("%<[^>]+>%", ',', $arr_view['description']);
    $description = substr_utf8($description, 0, 50);

    // Decide on the placeholder text BEFORE the "you may also like" block is
    // appended. Testing the concatenated string could never detect an empty
    // article body.
    $body = $arr_view['body'];
    if ($body == '') {
        // Article content has not been generated yet.
        $body = '暂无数据,请等待。错误编码:' . md5($base_url);
    }

    /*
     * Replacement table (placeholder name => value). Order matters because
     * the replacements are applied one after another.
     */
    $arr_rep = array();
    $arr_rep['key'] = $arr['key'];
    $arr_rep['title'] = $arr_view['title'];
    $arr_rep['maintitle'] = $arr['title'];
    $arr_rep['miaoshu'] = $description;
    $arr_rep['base_url'] = $base;
    $arr_rep['body'] = $body . "<br>" . "您可能感兴趣:<br>" . $arr_view['x_link'];
    $arr_rep['adkey'] = $arr['adkey'];
    $arr_rep['adkey_code'] = urlencode($arr['adkey']);
    $arr_rep['adkey_iconv'] = urlencode(iconv("UTF-8", "GBK//IGNORE", $arr['adkey']));
    $arr_rep['pid'] = $cfg_tk_pid;
    $arr_rep['keys_link'] = $key_link;
    $arr_rep['tag_link'] = $tag_link;

    /*
     * Apply the replacements.
     */
    foreach ($arr_rep as $k => $v) {
        $con = str_replace('{' . $k . '}', $v, $con);
    }

    echo $con;

    include './fun/robot.php';
}
/**
 * Truncate text to a maximum length, optionally keeping HTML well-formed.
 *
 * Lengths are measured in characters via strlen_utf8()/substr_utf8(). In
 * HTML mode tags are copied to the output without being counted, each HTML
 * entity counts as one character, and tags still open at the cut are closed
 * at the end.
 *
 * @param string $text         Text to truncate.
 * @param int    $length       Maximum length, including $ending.
 * @param string $ending       Text appended to the truncated result.
 * @param bool   $exact        If false, cut back to the last space so that
 *                             no word is split.
 * @param bool   $considerHtml If true, treat $text as HTML.
 * @return string $text unchanged when it already fits, otherwise the
 *                truncated text followed by $ending (and closing tags).
 */
function truncate($text, $length = 200, $ending = '...', $exact = true, $considerHtml = true)
{
    $entity_pattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/ui';
    $open_tags = array();

    if ($considerHtml) {
        // If the plain text is shorter than the maximum length, return the whole text.
        if (strlen_utf8(preg_replace('/<.*?>/u', '', $text)) <= $length) {
            return $text;
        }

        // Split the text into (optional tag, following text) pairs.
        preg_match_all('/(<.+?>)?([^<>]*)/su', $text, $lines, PREG_SET_ORDER);

        $total_length = strlen_utf8($ending);
        $truncate = '';

        foreach ($lines as $line_matchings) {
            // If there is a tag in this pair, track it and copy it (uncounted) to the output.
            if (!empty($line_matchings[1])) {
                if (preg_match('/^<(\\s*.+?\\/\\s*|\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\\s.+?)?)>$/isu', $line_matchings[1])) {
                    // Empty element (with or without a closing slash, e.g. <br/>): nothing to track.
                } elseif (preg_match('/^<\\s*\\/([^\\s]+?)\\s*>$/us', $line_matchings[1], $tag_matchings)) {
                    // Closing tag: remove it from the open list. Names are
                    // stored lower-cased, so compare lower-cased as well.
                    $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                    if ($pos !== false) {
                        unset($open_tags[$pos]);
                    }
                } elseif (preg_match('/^<\\s*([^\\s>!]+).*?>$/su', $line_matchings[1], $tag_matchings)) {
                    // Opening tag: put it at the front so the innermost tag is closed first.
                    array_unshift($open_tags, strtolower($tag_matchings[1]));
                }

                $truncate .= $line_matchings[1];
            }

            // Length of the text part of the pair; every entity counts as one character.
            $content_length = strlen_utf8(preg_replace($entity_pattern, ' ', $line_matchings[2]));

            if ($total_length + $content_length > $length) {
                // Number of characters that still fit.
                $left = $length - $total_length;
                $entities_length = 0;

                // Entities inside the allowed range take more raw characters than they count for.
                if (preg_match_all($entity_pattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                    foreach ($entities[0] as $entity) {
                        // PREG_OFFSET_CAPTURE reports byte offsets; convert to a
                        // character offset so it is comparable with $left. The
                        // cut is safe because an entity starts with ASCII '&'.
                        $entity_pos = strlen_utf8(substr($line_matchings[2], 0, $entity[1]));

                        if ($entity_pos + 1 - $entities_length <= $left) {
                            $left--;
                            $entities_length += strlen_utf8($entity[0]);
                        } else {
                            // No more characters left.
                            break;
                        }
                    }
                }

                $truncate .= substr_utf8($line_matchings[2], 0, $left + $entities_length);

                // Maximum length is reached, so leave the loop.
                break;
            } else {
                $truncate .= $line_matchings[2];
                $total_length += $content_length;
            }

            // If the maximum length is reached, leave the loop.
            if ($total_length >= $length) {
                break;
            }
        }
    } else {
        if (strlen_utf8($text) <= $length) {
            return $text;
        }

        $truncate = substr_utf8($text, 0, $length - strlen_utf8($ending));
    }

    // If words should not be cut in the middle...
    if (!$exact) {
        // ...search for the last occurrence of a space...
        $spacepos = strrpos($truncate, ' ');

        // strrpos() returns false when there is no space; in that case keep
        // the text as it is instead of cutting it down to nothing.
        if ($spacepos !== false) {
            // ...and cut the text at this position. $spacepos is a byte
            // offset pointing at an ASCII space, so a byte-wise substr() is
            // both correct and safe for multi-byte text.
            $truncate = substr($truncate, 0, $spacepos);
        }
    }

    // Add the defined ending to the text.
    $truncate .= $ending;

    if ($considerHtml) {
        // Close all unclosed HTML tags.
        foreach ($open_tags as $tag) {
            $truncate .= '</' . $tag . '>';
        }
    }

    return $truncate;
}
/**
 * Crawl one article.
 *
 * Takes the first id from $this->listNotCrawl, downloads the matching
 * zhidao.baidu.com question page, extracts the title and the answers, and
 * merges the answers (shuffled, '<br>'-separated) into $this->body.
 *
 * State touched: listNotCrawl, errorTime, isUp, title, body.
 *  - No title found: errorTime is increased and the id stays queued.
 *  - More than 6 errors: the id is dropped and errorTime reset.
 *  - Body above 20000 bytes: it is cut to 20000 characters and the queue is
 *    cleared so that crawling stops.
 *
 * @return void
 */
function CrawlView()
{
    if (count($this->listNotCrawl) < 1) {
        // Everything has been crawled; nothing more to fetch.
        //$this->isUp=FALSE;
        return;
    }

    if ($this->errorTime > 6) {
        // More than 6 errors: skip this article.
        array_splice($this->listNotCrawl, 0, 1);
        $this->errorTime = 0;
        $this->isUp = TRUE;
        return;
    }

    $articleId = $this->listNotCrawl[0];
    $data = CurlGet('http://zhidao.baidu.com/question/' . $articleId . '.html');
    $data = iconv("GBK", "UTF-8", $data);

    // Title
    $title = '';
    if (preg_match('%<title>([^>]+?)_%sim', $data, $arr)) {
        $title = strip_tags($arr[1]);
    }
    if ($title == '') {
        $this->isUp = TRUE;
        $this->errorTime += 1;
        return;
    }

    /* Question description (disabled)
    $question_description='';
    if(preg_match('%<pre id="question-content">([^<]+)</pre>%sim',$data,$arr)){
    $question_description=trim($arr[1]);
    }*/

    // Real line-break characters, longest first so that "\r\n" yields a
    // single <br>. (Single-quoted '\r' / '\n' would be a literal backslash
    // followed by a letter and never match a line break.)
    $lineBreaks = array("\r\n", "\r", "\n");

    // Must be an array: appending with [] to a string is a fatal error on
    // PHP >= 7.1, and array_merge() below needs an array even when the page
    // has no answers.
    $answers = array();

    // Best answer
    if (preg_match('%<pre id="best-answer-content[^>]+?>([^<]+?)</pre>%sim', $data, $arr)) {
        $answers[] = str_replace($lineBreaks, '<br>', strip_tags(trim($arr[1])));
    }

    // Other answers
    if (preg_match_all('%<pre class="reply[^>]+?>([^<]+?)</pre>%sim', $data, $arr)) {
        foreach ($arr[1] as $r) {
            $answers[] = str_replace($lineBreaks, '<br>', strip_tags(trim($r)));
        }
    }

    // Release the page source early.
    $data = null;
    $arr = null;
    unset($data, $arr);

    // Merge the new answers into the fragments collected so far.
    $arrBody = explode('<br>', $this->body);
    $arrBody = array_merge($arrBody, $answers);
    if (count($arrBody) <= 1) {
        $body = $arrBody[0];
    } else {
        shuffle($arrBody);
        $body = implode('<br>', $arrBody);
    }

    // Remove the title text from the body, then shuffle the fragments again.
    $body = str_replace($title, '', $body);
    $arrBody = explode('<br>', $body);
    shuffle($arrBody);
    $body = implode('<br>', $arrBody);

    if ($this->title == '') {
        $this->title = $title;
    }
    $this->errorTime = 0;

    if (strlen($body) > 20000) {
        // Body reached 20K: stop crawling.
        $body = substr_utf8($body, 0, 20000);
        $this->body = $body;
        $body = null;
        $this->listNotCrawl = array();
        $this->isUp = TRUE;
        return;
    }

    $this->body = $body;
    $body = null;

    // Drop the article that was crawled successfully.
    array_splice($this->listNotCrawl, 0, 1);
    $this->isUp = TRUE;
}
/**
 * Build a title and a description for an article from five text files.
 *
 * File numbers and the shuffle sequence are derived from the MD5 of the
 * article URL. The description additionally uses rand() and shuffle(), so
 * it differs between calls.
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated secondary keywords.
 * @param mixed  $a_no     Article number, used in the URL that is hashed.
 * @param string $base_url Base URL, used in the URL that is hashed.
 * @return array ['title' => string, 'des' => string]
 */
function randart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;

    $md5_url = md5('http://' . $base_url . 'read-' . $a_no . 'html');

    /*
     * Fingerprint sequences
     * $art_sp:   file numbers of the five source texts
     * $choas_sp: sequence handed to arr_choas()
     */
    $art_sp = array();
    for ($n = 0; $n < 5; $n++) {
        $art_sp[$n] = toTen(substr($md5_url, $n * 4, 4)) + 1;
    }

    $digits = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digits .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digits, 1);

    /*
     * Secondary keyword: first entry after reordering the keyword list.
     */
    $arr_keys = explode(",", $keys);
    $arr_keys = arr_choas($arr_keys, $choas_sp);
    $key2 = $arr_keys[0];

    /*
     * Title
     * First pass: the five texts combined with the secondary keyword,
     * reordered once. Second pass: 20 characters of that, plus the main
     * keyword, reordered again.
     */
    $title = '';
    $des = '';
    foreach ($art_sp as $file_no) {
        $temp = file_get_contents($cfg_xs_url . $file_no . ".txt");
        $title .= $temp . '|' . $key2;
        $des .= $temp;
    }

    $arr_title = explode("|", $title);
    $arr_title = arr_choas($arr_title, $choas_sp); // reorder the title by fingerprint
    $title = implode('|', $arr_title);

    // HTML tags become commas (original note: keeps letters inside URLs
    // from being matched); punctuation is then stripped.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', '　', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);

    $arr_title = explode("|", $title . '|' . $key);
    $arr_title = arr_choas($arr_title, $choas_sp);
    $title = implode('', $arr_title);

    /*
     * Description: the secondary keyword is only mixed in when rand(0, 10)
     * comes out above 7.
     */
    $des_source = $des . '|';
    if (rand(0, 10) > 7) {
        $des_source .= $key2 . '|';
    }
    $des_source .= $key;

    $arr_des = explode('|', $des_source);
    shuffle($arr_des);
    $des = implode('', $arr_des);
    $des = replace_dbs($des, '.');
    $des = substr_utf8($des, 0, 500);

    /*
     * Result
     */
    return array(
        'title' => $title,
        'des'   => $des,
    );
}
/**
 * Build a title and a body for an article by inserting keywords into texts.
 *
 * File numbers and the insertion sequence are derived from the MD5 of the
 * article URL. The title comes from the first text file; the body is made
 * of that text followed by 28 further files.
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated secondary keywords.
 * @param mixed  $a_no     Article number, used in the URL that is hashed.
 * @param string $base_url Base URL, used in the URL that is hashed.
 * @return array ['title' => string, 'body' => string]
 */
function insertart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;

    $md5_url = md5('http://' . $base_url . 'reads-' . $a_no . 'html');

    /*
     * Fingerprint sequences
     * $body_sp:  28 file numbers; entry 0 also supplies the title
     * $choas_sp: sequence handed to arr_choas() / arr_insertkey()
     */
    $body_sp = array();
    for ($i = 0; $i < 7; $i++) {
        // Four overlapping 4-character windows per step fill entries
        // $i, $i + 7, $i + 14 and $i + 21.
        for ($shift = 0; $shift < 4; $shift++) {
            $body_sp[$i + 7 * $shift] = toTen(substr($md5_url, $i * 4 + $shift, 4)) + 1;
        }
    }

    $digits = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digits .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digits, 1);

    /*
     * Secondary keyword: first entry after reordering the keyword list.
     */
    $arr_keys = explode(",", $keys);
    $arr_keys = arr_choas($arr_keys, $choas_sp);
    $key2 = $arr_keys[0];

    /*
     * Title: take the first text and insert the keywords by fingerprint.
     */
    $first_text = file_get_contents($cfg_xs_url . $body_sp[0] . ".txt");
    $body = $first_text;

    $arr_title = explode("|", $first_text);
    $arr_key = array($key, $key2);
    $title = arr_insertkey($arr_title, $choas_sp, $arr_key, 1);

    // HTML tags become commas (original note: keeps letters inside URLs
    // from being matched); punctuation is then stripped.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', '　', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);

    /*
     * Body: append the 28 texts, with a <br> after every fifth one, then
     * insert the keywords by fingerprint.
     */
    for ($i = 0; $i < 28; $i++) {
        $body .= '|' . file_get_contents($cfg_xs_url . $body_sp[$i] . ".txt");
        if ($i % 5 === 4) {
            $body .= '<br>';
        }
    }

    $arr_body = explode('|', $body);
    $arr_key = explode(",", implode(',', array($key, $keys, $key, $key, $key)));
    $body = arr_insertkey($arr_body, $choas_sp, $arr_key, 10);
    $body = replace_dbs($body, '.');
    $body = str_replace('　', '', $body);
    //$body=substr_utf8($body,0,500);

    /*
     * Result
     */
    return array(
        'title' => $title,
        'body'  => $body,
    );
}
<?php
header('Content-type:text/html;charset=utf-8');

/**
 * Character-based substring for UTF-8 text.
 *
 * The string is split into single characters with a /u regular expression.
 * The character list is dumped with var_dump() (the demo output of this
 * script) and the requested slice is returned as a string.
 *
 * @param string   $str    UTF-8 text.
 * @param int      $start  Index of the first character; defaults to 0 so the
 *                         function can be called with the text alone.
 * @param int|null $length Number of characters, or null for "to the end".
 * @return string The selected characters; '' when the text cannot be split
 *                (preg_split() returned false).
 */
function substr_utf8($str, $start = 0, $length = null)
{
    $arr = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);
    var_dump($arr);

    if ($arr === false) {
        return '';
    }

    return implode('', array_slice($arr, $start, $length));
}

$str = '传智播客PHP学院';
substr_utf8($str);
/**
 * Build a title and a body for an article by reordering text fragments.
 *
 * File numbers and the reordering sequence are derived from the MD5 of the
 * article URL. The first five files feed the title (and the start of the
 * body); files 5..27 are appended to the body.
 *
 * @param string $key      Main keyword.
 * @param string $keys     Comma-separated secondary keywords.
 * @param mixed  $a_no     Article number, used in the URL that is hashed.
 * @param string $base_url Base URL, used in the URL that is hashed.
 * @return array ['title' => string, 'body' => string]
 */
function readart($key, $keys, $a_no, $base_url)
{
    global $cfg_xs_url;

    $md5_url = md5('http://' . $base_url . 'read-' . $a_no . 'html');

    /*
     * Fingerprint sequences
     * $body_sp:  28 file numbers for the body
     * $art_sp:   the first five of them, used for the title
     * $choas_sp: sequence handed to arr_choas()
     */
    $body_sp = array();
    for ($i = 0; $i < 7; $i++) {
        // Four overlapping 4-character windows per step fill entries
        // $i, $i + 7, $i + 14 and $i + 21.
        for ($shift = 0; $shift < 4; $shift++) {
            $body_sp[$i + 7 * $shift] = toTen(substr($md5_url, $i * 4 + $shift, 4)) + 1;
        }
    }

    $art_sp = array();
    for ($n = 0; $n < 5; $n++) {
        $art_sp[$n] = $body_sp[$n];
    }

    $digits = '';
    foreach (array(0, 8, 16, 24) as $offset) {
        $digits .= toTen(substr($md5_url, $offset, 8));
    }
    $choas_sp = str_split($digits, 1);

    /*
     * Secondary keyword: first entry after reordering the keyword list.
     */
    $arr_keys = explode(",", $keys);
    $arr_keys = arr_choas($arr_keys, $choas_sp);
    $key2 = $arr_keys[0];

    /*
     * Title
     * First pass: the five texts combined with the secondary keyword,
     * reordered once. Second pass: 20 characters of that, plus the main
     * keyword, reordered again.
     */
    $title = '';
    foreach ($art_sp as $file_no) {
        $title .= file_get_contents($cfg_xs_url . $file_no . ".txt") . '|' . $key2;
    }

    // The body starts from the same raw material as the title.
    $body = $title;

    $arr_title = explode("|", $title);
    $arr_title = arr_choas($arr_title, $choas_sp); // reorder the title by fingerprint
    $title = implode('|', $arr_title);

    // HTML tags become commas (original note: keeps letters inside URLs
    // from being matched); punctuation is then stripped.
    $title = preg_replace("%<[^>]+>%", ',', $title);
    $title = str_replace(array(',', '。', ':', '”', '“', '!', '……', '?', ';', '　', ','), '', $title);
    $title = replace_db($title, '');
    $title = substr_utf8($title, 0, 20);

    $arr_title = explode("|", $title . '|' . $key);
    $arr_title = arr_choas($arr_title, $choas_sp);
    $title = implode('', $arr_title);

    /*
     * Body: append files 5..27; the first eleven of them are each followed
     * by the main keyword and a sentence break.
     */
    for ($i = 5; $i < 28; $i++) {
        $body .= '|' . file_get_contents($cfg_xs_url . $body_sp[$i] . ".txt");
        if ($i < 16) {
            $body .= '|' . $key . '|。<br>';
        }
    }

    $arr_body = explode("|", $body);
    $arr_body = arr_choas($arr_body, $choas_sp);
    $body = implode('', $arr_body) . '。';
    $body = replace_dbs($body, '。');

    /*
     * Result
     */
    return array(
        'title' => $title,
        'body'  => $body,
    );
}
{
    // NOTE(review): this is the body of a function whose signature lies
    // above this chunk. It splits $str into characters with a /u regex and
    // joins the slice selected by $start and $lenth.
    return join("", array_slice(preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY), $start, $lenth));
}
// Demo: character-based substring of a UTF-8 string.
$str = '贾智超敬爱之巢';
echo substr_utf8($str, 2, 1);
echo "<br/>-----------------------------------------------------------------------------<br/>";
// Demo: the same kind of extraction with the mbstring extension.
$str1 = '一二三四五六七吧九十';
echo mb_substr($str1, 2, 3);
echo "<br/>-----------------------------------------------------------------------------<br/>";
/**
 * Determine the number of characters in a string.
 */
$a = '中国2北333京';
echo mb_strlen($a, 'utf8'); // get the string length
echo "<br/>";
echo mb_substr($a, 3, 1, 'utf8'); // extract a substring
/**
 * Custom function that computes the length.
 * @param $str String The string to measure
 * @return int The length of the string
 */
echo "<br/>";
function strlen_utf8($str)
{
    // Split into single characters with a /u regex and count the pieces.
    return count(preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY));
}
$a = '中国2北343433京';
echo substr_utf8($a, 3, 1);
// Build the tag links: one <li><a>…</a></li> entry per tag keyword.
$tagLink = '';
$arrTagKey = explode(',', $indexCache->tagKey);
foreach ($arrTagKey as $v) {
    $tagEPath = GetEPath($v);
    $arr = explode('/', $tagEPath);
    // The anchor must be closed before the list item; without </a> every
    // following entry ends up nested inside the previous link.
    $tagLink .= '<li><a href="http://' . $arr[0] . $indexCache->mainDomain . '/' . $arr[1] . '">' . $v . '</a></li>';
}

// Replacement table (placeholder name => value). Order matters because the
// replacements are applied one after another.
$arrReplace = array();
$arrReplace['key'] = $indexCache->key;
$arrReplace['keys'] = $indexCache->keys;
$arrReplace['title'] = $baidu->title;
$arrReplace['maintitle'] = $indexCache->title;
$arrReplace['adkey'] = $indexCache->adKey;
$arrReplace['keyslink'] = $keysLink;
$arrReplace['taglink'] = $tagLink;
if ($baidu->body == '') {
    // No article content yet: show a placeholder with an error code.
    $arrReplace['body'] = '暂无数据,请等待。错误编码:' . md5($indexCache->baseUrl . $viewID);
} else {
    $arrReplace['body'] = $baidu->body;
}
$arrReplace['body'] .= '您可能感兴趣:<br>' . $baidu->links;
$arrReplace['baseurl'] = $indexCache->baseUrl;

// HTML tags in the description become commas; only 50 characters are kept.
$description = preg_replace("%<[^>]+>%", ',', $baidu->description);
$description = substr_utf8($description, 0, 50);
$arrReplace['description'] = $description;

// Fill every {placeholder} in the template and print the page.
foreach ($arrReplace as $k => $v) {
    $templets = str_replace('{' . $k . '}', $v, $templets);
}
echo $templets;
/**
 * Reduce a BBCode message to plain text and optionally shorten it.
 *
 * Steps: messagesafeclear(), cut at the first run of three NUL bytes,
 * remove or unwrap BBCode, strip_tags(), keep the first $length characters
 * (only when $length is truthy), trim().
 *
 * @param string $str    Raw message.
 * @param int    $length Maximum number of characters; 0 keeps everything.
 * @param string $dot    Not used by the current implementation (the
 *                       cutstr() call that consumed it is disabled).
 * @return string The cleaned, trimmed text.
 */
function messagecutstr($str, $length = 0, $dot = ' ...')
{
    $str = messagesafeclear($str);

    // Everything from the first three-NUL marker onwards is discarded.
    $markerPos = strpos($str, str_repeat(chr(0), 3));
    if ($markerPos !== false) {
        $str = substr($str, 0, $markerPos);
    }

    #loadcache(array('bbcodes_display', 'bbcodes', 'smileycodes', 'smilies', 'smileytypes', 'domainwhitelist'));

    // Formatting codes: only the tags are removed, their content stays.
    $bbcodes = 'b|i|u|p|color|size|font|align|list|indent|float';
    // Codes that are removed together with their content.
    $bbcodesclear = 'email|code|free|table|tr|td|img|swf|flash|attach|media|audio|groupid|payto';

    // [i]...[/i] blocks are dropped entirely before the general pass.
    $str = preg_replace("/\\[i=?.*?\\](.*?)\\[\\/i\\]/is", '', $str);

    // pattern => replacement, applied in this order.
    $rules = array(
        "/\\[hide=?\\d*\\](.*?)\\[\\/hide\\]/is"            => '',
        "/\\[quote](.*?)\\[\\/quote]/si"                    => '',
        "/\\[url=?.*?\\](.+?)\\[\\/url\\]/si"               => '\\1',
        "/\\[({$bbcodesclear})=?.*?\\](.*?)\\[\\/\\1\\]/si" => '',
        "/\\[({$bbcodes})=?.*?\\]/i"                        => '',
        "/\\[\\/({$bbcodes})\\]/i"                          => '',
    );
    $str = preg_replace(array_keys($rules), array_values($rules), $str);
    $str = strip_tags($str);

    if ($length) {
        // $str = cutstr($str, $length, $dot);
        $str = substr_utf8($str, 0, $length);
    }

    #$str = preg_replace($_G['cache']['smilies']['searcharray'], '', $str);

    return trim($str);
}