// Test 1 (continued): list every match in a text that contains several keywords.
$str = 'hello word2 haha word1 word4 word2';
$arrRet = trie_filter_search_all($resTrie, $str);
print_all($str, $arrRet);

echo "\ntest2///////////////////\n";
$str = 'hello word';
// trie_filter_search() result is wrapped in a one-element list so print_all() can take it.
// The original note marks this result as "Array()", i.e. presumably empty for this input.
$arrRet = trie_filter_search($resTrie, $str);
print_all($str, array($arrRet));
$arrRet = trie_filter_search_all($resTrie, 'hello word');
print_all($str, $arrRet);

echo "\ntest3///////////////////\n";
// Report memory before and after a batch of searches.
echo "start memory=" . memory_get_usage(true) . "\n";
date_default_timezone_set('Asia/Chongqing');
$test = array('a', 'abd', 'dad', 'pab', 'dda', 'word1f', 'cword1', 'cword1t');
foreach ($test as $v) {
    // echo "per start memory=".memory_get_usage(true)."\n";
    $arrRet = trie_filter_search_all($resTrie, $v);
    // echo "per end memory=".memory_get_usage(true)."\n";
    //print_all($v, $arrRet);
}
echo "end memory=" . memory_get_usage(true) . "\n";
echo date('Y-m-d H:i:s');

trie_filter_free($resTrie);

/**
 * Print the searched text, then one line per match as "index=>offset-length-matched text".
 *
 * @param string $str Text that was searched.
 * @param array  $res List of matches; each entry is read as array(offset, length),
 *                    where both values are passed to substr() on $str.
 */
function print_all($str, $res)
{
    //print_r($res);
    echo "{$str}\n";
    foreach ($res as $k => $v) {
        // A "no match" entry (e.g. an empty array) has no offset/length to print;
        // reading $v[0] / $v[1] on it would raise undefined-offset errors.
        if (!isset($v[0], $v[1])) {
            continue;
        }
        echo $k . "=>{$v[0]}-{$v[1]}-" . substr($str, $v[0], $v[1]) . "\n";
    }
}
ini_set('display_errors', 1);
error_reporting(E_ALL ^ E_NOTICE);

/**
 * Print the searched text, the number of matches, and one "id -> matched text" line per match.
 *
 * @param string $str Text that was searched.
 * @param array  $res Flat list read in groups of three: offset, length, keyword id.
 */
function print_all($str, $res)
{
    echo "\ntext:{$str}\n";
    echo "\nmatch ";
    $total = count($res);
    echo $total / 3, "\n";

    $pos = 0;
    while ($pos < $total) {
        // $res[$pos] = offset, $res[$pos + 1] = length, $res[$pos + 2] = keyword id.
        echo 'id:', $res[$pos + 2], ' -> ', substr($str, $res[$pos], $res[$pos + 1]), "\n";
        $pos += 3;
    }
}

// The same normalisation flags are used when storing keywords and when searching.
$flags = TRIE_FILTER_UP | TRIE_FILTER_SP | TRIE_FILTER_NUM;

$arrWord = array('Key', 'w ord', '12345678');
$resTrie = trie_filter_new();

echo "\nsave key\n";
foreach ($arrWord as $id => $word) {
    echo "id:", $id, ' -> ', $word, "\n";
    trie_filter_store($resTrie, $id, $word, $flags);
}
// Per the original note: spaces inside a keyword are removed, lower case becomes upper case
// and digits become 0, so the stored keywords are effectively
// array('WORD', 'KEY', '00000000').

// Round-trip the trie through a file on disk.
trie_filter_save($resTrie, __DIR__ . '/blackword.tree');
$resTrie = trie_filter_load(__DIR__ . '/blackword.tree');

$text = 'hello wo rd WORD kEy 0123456789';
$arrRet = trie_filter_search_all($resTrie, $text, $flags);
// Per the original note: lower case becomes upper case, digits become 0 and spaces in the
// text are ignored, so the text is effectively 'HELLOWORDWORDKEY0000000000'.
print_all($text, $arrRet);

trie_filter_free($resTrie);

// Expected output (the listing is cut off in this source):
// save key
// id:0 -> Key
/**
 * Replace the first sensitive word found in a string with asterisks.
 *
 * Matches come from trie_filter_search_all() on $this->trie; each match is read as
 * array(offset, length) and applied to $str with substr(). Only the first match that
 * is not skipped is masked; the rest of the string is left untouched.
 *
 * @param mixed $str Text to filter.
 * @return string '' when $str is not a string or is the empty string; $str unchanged when
 *                filtering is disabled or no match is replaced; otherwise the masked text.
 * @throws Exception When $this->trie is not a resource.
 */
protected function oneReplace($str)
{
    // Compare against '' explicitly: empty() also treats the string "0" as empty,
    // which would turn a legitimate "0" input into ''.
    if (!is_string($str) || $str === '') {
        return '';
    }
    if (!$this->isEnable) {
        return $str;
    }
    if (!is_resource($this->trie)) {
        throw new Exception('Trie Error');
    }

    // Search for sensitive words; with none found the sentence is returned as-is.
    $arrRet = trie_filter_search_all($this->trie, $str);
    if (empty($arrRet)) {
        return $str;
    }

    foreach ($arrRet as $value) {
        $start = $value[0];
        $length = $value[1];
        $word = substr($str, $start, $length);

        if (self::isSubstitute($word)) {
            continue;
        }

        /**
         * English sensitive words need an extra check, because the match may be only
         * a fragment of a longer word. Example: if "ab" is a sensitive word, the "ab"
         * inside "abstract" must not count as one and must not be replaced.
         */
        if (self::isAlpha($word)) {
            $beforeChr = mb_substr(substr($str, 0, $start), -1, 1, 'UTF-8');
            $afterChr = mb_substr(substr($str, $start + $length), 0, 1, 'UTF-8');
            // Do not replace when the neighbouring character on either side is also English.
            if (self::isAlpha($beforeChr) || self::isAlpha($afterChr)) {
                continue;
            }
        }

        // One replacement is enough: text before the word, one '*' per UTF-8 character
        // of the word, then everything after it.
        return substr($str, 0, $start)
            . str_repeat('*', mb_strlen($word, 'UTF-8'))
            . substr($str, $start + $length);
    }

    // Every match was skipped, so nothing changes.
    return $str;
}