/
String.php
333 lines (310 loc) · 11.9 KB
/
String.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
<?php
/**
* 字符串处理
*
* @package helper
* @author
*/
class Helper_String {
//禁止实例化
final protected function __construct() {
throw new Exception_System(200402, "此类禁止实例化", '');
}
/**
* 按照字长截取多字节字符串,并进行htmlspecialchars处理
* @param string $string
* @param <type> $length
* @param <type> $sublen
* @param <type> $dot
* @param <type> $htmlspecialchars
* @param <type> $encoding
* @return <type>
*/
static public function cnSubStr($string, $length, $sublen, $dot = '...', $htmlspecialchars = false, $encoding = 'utf-8') {
if (mb_strlen($string, $encoding) > $length) {
$string = mb_substr($string, 0, $sublen, $encoding) . $dot;
}
return $htmlspecialchars ? htmlspecialchars($string, ENT_QUOTES) : $string;
}
/**
* 截取多字节字符串,并进行htmlspecialchars处理
* @param string $string
* @param int $length
* @param string $dot
* @param int $htmlspecialchars
* @return string
*/
static public function cnCut($string, $length, $dot = '...', $htmlspecialchars = true) {
if (mb_strwidth($string) > $length) {
$string = mb_strimwidth($string, 0, $length, $dot);
}
return $htmlspecialchars ? htmlspecialchars($string, ENT_QUOTES) : $string;
}
/**
* 按显示字符的宽度截取已经htmlspecialchars后的字符串
* @param type $string
* @param type $length
* @param type $dot
* @return type
*/
static public function cnCutEncoded($string, $length, $dot = '...') {
if (mb_strwidth($string) <= $length) {
return $string;
}
$string = htmlspecialchars_decode($string, ENT_QUOTES);
$string = mb_strimwidth($string, 0, $length, $dot);
return htmlspecialchars($string, ENT_QUOTES);
}
/**
* 对传入的内容标红处理,可能存在多个关键字需要标红则循环处理
*
* @param string $content 内容
* @param string $searKey 标红的对象
* @return string
*/
static public function redTag($content, $sear_key) {
if (in_array($sear_key, array('~', '/'))) {
$sear_key_arr = array($sear_key);
} else {
$sear_key_arr = array_unique(preg_split("/[\s|\/|~]+/", $sear_key));
}
if (count($sear_key_arr) > 0) {
foreach ($sear_key_arr as $v) {
$vv = $v;
if ($vv === true)
continue;
//过滤转义特殊字符
$v_word = array('+', '.', '?', '$', '^', '*', '(', ')', '[', ']', '{', '}');
foreach ($v_word as $vm) {
$vv = str_replace($vm, "\\{$vm}", $vv);
}
$content = self::dealRedTag($content, $vv);
}
}
return $content;
}
/**
* 标红处理
*
* @param string $content 内容
* @param string $searKey 标红的对象
* @return string
*/
static private function dealRedTag($content, $preg_key) {
if (empty($content) || empty($preg_key)) {
return $content;
}
$html_tags = array();
preg_match_all("/<(\S*?)[^>]*>.*?<\/\\1>|<[^>]+>|<sina:link[^>]*>/i", $content, $tmps);
foreach ($tmps[0] as $k => $v) {
array_push($html_tags, array('sTag' => "#tag{$k}#", 'oTag' => $v));
}
if (count($html_tags) > 0) {
foreach ($html_tags as $ht) {
$content = str_replace($ht['oTag'], $ht['sTag'], $content);
}
}
$content = preg_replace("/($preg_key)/i", "<span style='color: red;'>\\1</span>", $content);
$content = str_replace("#", "#", $content);
$content = preg_replace("/#([^#]+)#/ies", "strip_tags('#\\1#')", $content);
if (count($html_tags) > 0) {
foreach ($html_tags as $ht) {
$content = str_replace($ht['sTag'], $ht['oTag'], $content);
}
}
return $content;
}
/**
* 字符串截取方法
* 支持中英文、全角半角、
*
* @param string $string 要截取的字符串
* @param int $length 截取长度
* @param string $dot 截取后,显示符
* @param string $charset 字符编码
* @return string
*/
public static function truncate($string, $length=50, $dot = '...', $charset='utf-8'){
$strlen = strlen($string);
//$strlen = mb_strwidth($string, $charset);
if($strlen <= $length) return $string;
$string = str_replace(array(' ', '&', '"', ''', '“', '”', '—', '<', '>', '·', '…'), array(' ', '&', '"', "'", '“', '”', '—', '<', '>', '·', '…'), $string);
$strcut = '';
if(strtolower($charset) == 'utf-8'){
$n = $tn = $noc = 0;
while($n < $strlen){
$t = ord($string[$n]);
if($t == 9 || $t == 10 || (32 <= $t && $t <= 126)) {
$tn = 1; $n++; $noc++;
} elseif(194 <= $t && $t <= 223) {
$tn = 2; $n += 2; $noc += 2;
} elseif(224 <= $t && $t <= 239) {
$tn = 3; $n += 3; $noc += 2;
} elseif(240 <= $t && $t <= 247) {
$tn = 4; $n += 4; $noc += 2;
} elseif(248 <= $t && $t <= 251) {
$tn = 5; $n += 5; $noc += 2;
} elseif($t == 252 || $t == 253) {
$tn = 6; $n += 6; $noc += 2;
} else {
$n++;
}
if($noc >= $length) break;
}
if($noc > $length) $n -= $tn;
$strcut = substr($string, 0, $n);
//$strcut = mb_strimwidth($string, 0, $n, "", $charset);
}
else{
$dotlen = strlen($dot);
$maxi = $length - $dotlen - 1;
for($i = 0; $i < $maxi; $i++){
$strcut .= ord($string[$i]) > 127 ? $string[$i].$string[++$i] : $string[$i];
}
}
$strcut = str_replace(array('&', '"', "'", '<', '>'), array('&', '"', ''', '<', '>'), $strcut);
return $strcut.$dot;
}
/**
* HTML标签闭合
*
* @param string $html 输入HTML
* @param boolean $suto_span 纯JS时是否自动套空SPAN,避免SCRIPT被过滤的问题
*
* @return string 处理后的HTML
*/
static public function tidy($html, $auto_span=true) {
if(!extension_loaded('tidy')) {
if(function_exists('dl')) {
Helper_Debug::error('Tidy is not loaded, try to dl.');
dl('tidy.so');
} else {
throw new Exception_System(200107, 'PHP extension tidy not loaded.', array());
}
}
$html = trim($html);
if($auto_span && stripos($html, '<script') === 0 && stripos($html, '</script>') === (strlen($html)-9)) {
$html = '<span>'.$html.'</span>';
}
return tidy_repair_string($html, array('output-xhtml' => true, 'show-body-only' => true, 'input-encoding' => 'utf8', 'output-encoding' => 'utf8', 'char-encoding' => 'utf8'), 'utf8');
}
/**
* 去掉最右边的URL
* @param string $text
* @param string $link_prefix 链接前缀(支持正则表过式)
* @return type
*/
static public function cutRightUrl($text, $link_prefix = '详情:') {
return preg_replace("~\s*{$link_prefix}https?://[a-z0-9A-Z/?.]+$~", '', $text);
}
/**
* 检查字符串中是否有空白
* @param type $text
* @return boolean
*/
static public function hasGap($text) {
return (boolean)preg_match('/\s/', $text);
}
/**
* 判断一个字符串是否包含4个字节的utf8字符
*
* @param string $text 要检测的字符串
* @return int 1.包含;0.不包含
*/
static public function isUtf8mb4($text) {
return preg_match('/[\xF0-\xFF]/', $text);
}
/**
* 中文分词
*
* @param string $content
* @param enum $mode 分词的规则,enum(SCWS_MULTI_SHORT, SCWS_MULTI_DUALITY, SCWS_MULTI_ZMAIN, SCWS_MULTI_ZALL)
* 分别表示短语,二元,主要单字,所有单字
*
* @return array $splits 包含被分的词的数组
*/
public static function cnSplit($content, $mode=SCWS_MULTI_ZALL) {
if (empty($content) || !function_exists('scws_new')) {
return $content;
}
$ws_obj = scws_new();
$ws_obj->set_charset('utf8');
$ws_obj->send_text($content);
$ws_obj->set_multi($mode);
$splits = array();
while($tmp=$ws_obj->get_result()) {
foreach ($tmp as $each) {
$splits[] = $each['word'];
}
}
$ws_obj->close();
return $splits;
}
/**
* 过滤ascii码的所有特殊字符
*
* @param string $str
* @param string $replace 过滤掉的ascii码的替代字符
*
* @return string $rst
*/
public static function escapeAsciiSpecAll($str, $replace='') {
return preg_replace("/([\\x00-\\x1f\\x21-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\x7f]+)/i", $replace, $str);
}
/**
* 转化全角空格为半角,\n,\r为空格,合并多个空格为1一个,把全角@转成半角@
* 这个是发微博的需求
*
* @param string $str
*
* @return string
*/
public static function trimAllBlank($str) {
$str = str_replace(array(" ", "\n", "\r"), " ", $str);
$str = preg_replace("/[ ]{1,}/", " ", $str);
$str = str_replace('@', '@', $str);
return $str;
}
/**
* 计算字符串的长度 [yanzhong@20130116]
* [注] 常用于DB字段长度判断,如:varchar(255),代表存255个字符,可为255汉字,可为255英文,也存在混杂情况。
*
* @param string $string string
* @return int ($zhStr = '您好,中国!', $len = 6; $str = 'Hello,中国!'; $len=9)
*/
public static function utf8_strlen($string = null) {
// 将字符串分解为单元
preg_match_all('/./us', $string, $match);
// 返回单元个数
return count($match[0]);
}
/**
*
*
* @param type $content
* @return type
*/
public static function checkStatusLength($content, $max = 140) {
$min = 41;
$surl = 20;
$urlCount = 0;
preg_match_all("/http:\/\/[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+([-A-Z0-9a-z_\$\.\+\!\*\(\)\/,:;@&=\?\~\#\%]*)*/i", $content, $urls);
foreach ($urls[0] as $url) {
$count = mb_strwidth($url, 'UTF-8');
if (preg_match('/^(http:\/\/t.cn)/', $url)) {
continue;
}else {
if (preg_match('/^(http:\/\/)+(t.sina.com.cn|t.sina.cn)/', $url) || preg_match('/^(http:\/\/)+(weibo.com|weibo.cn)/', $url)) {
//本域的小于41字符按照实际值算,大于41小于等于140以20字符算,超过140为20加溢出长度
$urlCount += $count <= $min ? $count : ($count <= $max ? $surl : ($count - $max + $surl));
}else {
//非本域的小于140的按照20算,超过140为20加溢出长度
$urlCount += $count <= $max ? $surl : ($count - $max + $surl);
}
}
$content = str_replace($url, "", $content);
}
$result = ceil(($urlCount + mb_strwidth($content, 'UTF-8')) / 2);
return $result;
}
}