/**
 * Look up a (possibly already split) word in the dictionary file.
 *
 * $word may contain several whitespace-separated tokens. Only the last
 * token is searched in the dictionary; the preceding tokens are treated
 * as already-recognised prefixes and are glued in front of every
 * candidate with '-'.
 *
 * When no exact match is found but _file_match() reports a similar entry,
 * the first field of that entry is taken as a prefix of the word, the
 * remainder is passed to getNoun(), and the lookup recurses on
 * "<prefixes> <stem>".
 *
 * Results are memoised in $this->_cache under the tokens joined without
 * a separator.
 *
 * @param string $word  word to look up; whitespace separates known prefixes from the token to search
 * @param mixed  $flag  forwarded unchanged to _file_match()
 * @param float  $fuzzy not used by this method
 *
 * @return array array($c, $cand, $last): $c and $last as returned by
 *               _file_match(), $cand a list of array(word, tag name, rest)
 *               built from the matching dictionary lines
 */
function isWord($word, $flag = false, $fuzzy = 0.7)
{
    $encoding = 'UTF-8';

    // simple caching
    $words = preg_split('/\\s+/', $word);
    $cword = implode('', $words);
    if (array_key_exists($cword, $this->_cache)) {
        return $this->_cache[$cword];
    }

    // Only the last token is searched; the rest act as prefixes.
    $word = array_pop($words);

    list(, $min_seek, $max_seek) = _fuzzy_bsearch_file(
        $this->_dict,
        $word,
        0,
        $this->_dict_size / 2,
        0,
        $this->_dict_size
    );
    list($c, $buf, $last) = _file_match(
        $this->_dict,
        $word,
        $min_seek,
        $max_seek,
        $this->_dict_size,
        0,
        $flag,
        $encoding
    );

    $cand = array();
    if (!empty($c)) {
        // Matches found: every line of $buf is "key:tag:rest".
        $pre = empty($words) ? '' : implode('-', $words) . '-';
        foreach (explode("\n", rtrim($buf)) as $line) {
            list($k, $t, $r) = explode(':', $line);
            $cand[] = array($pre . $k, $this->tagName($t), $r);
        }
    } elseif (!empty($last)) {
        // Similar match found.
        // XXX get shortest match word
        list($k) = explode(':', $buf);
        $pre = $k;

        // A one-character prefix is not considered a usable split point:
        // fall back to the first two characters of the word instead
        // (5 => 2 + 3, 4 => 2 + 2, 6 => 2 + 4, ...).
        // mb_strlen() replaces the deprecated strlen(utf8_decode()) idiom.
        if (mb_strlen($pre, $encoding) == 1 and strlen($word) > 1) {
            $pre = mb_substr($word, 0, 2, $encoding);
        }

        // Both lengths are byte lengths, so the cut lands right after $pre.
        $nword = substr($word, strlen($pre)); // next word
        $words[] = $pre;
        $pre = implode('-', $words);

        if (!empty($nword)) {
            // The original branched on the tag type (noun vs. not noun),
            // but both branches were identical, so the tag is no longer
            // consulted here.
            // NOTE(review): $match is not set before this call; presumably
            // getNoun() takes it by reference as an out-parameter - confirm.
            $stem = $this->getNoun($nword, $match);
            if (!empty($stem)) {
                return $this->isWord($pre . ' ' . $stem);
            }
        }
    }

    $ret = array($c, $cand, $last);
    // Store under the same key that the lookup above uses. Storing under
    // the last token alone made multi-token lookups miss the cache and
    // polluted the entry of the bare last token with prefixed candidates.
    $this->_cache[$cword] = $ret;

    return $ret;
}
/**
 * Action that looks up every word of a text in the TEXT_DICT file.
 *
 * The text is $options['value'] when given, otherwise the raw body of the
 * current page. It is split into unique words and each word is searched
 * with _fuzzy_bsearch_file() / _file_match(). The search results are only
 * reported when debugging is enabled; otherwise just the page header,
 * title and footer are emitted.
 *
 * Recognised options:
 *  - 'value'             text to analyse instead of the page body
 *  - 'debug'             truthy to print per-word results and timings;
 *                        values above 50 also dump the matched lines
 *  - 'timer'             object with Check() / Write(); only used when
 *                        'debug' is enabled
 *  - 'more_specific_len' optional; words whose byte length is not greater
 *                        than this are skipped (default 0 = keep all)
 *
 * @param object $formatter page formatter used to emit header, title and footer
 * @param array  $options   see above
 *
 * @return string|null '' when the dictionary cannot be opened, nothing otherwise
 */
function do_textdict($formatter, $options)
{
    $_debug = !empty($options['debug']) ? $options['debug'] : 0;

    // This variable used to be read without ever being defined, so the
    // length filter below never ran. It is now an optional setting that
    // defaults to "no filtering", matching the previous effective behaviour.
    $more_specific_len = isset($options['more_specific_len'])
        ? (int) $options['more_specific_len']
        : 0;

    $formatter->send_header('', $options);
    $formatter->send_title('', '', $options);

    if (!empty($options['value'])) {
        $value = $options['value'];
    } else {
        $value = $formatter->page->get_raw_body($options);
    }

    $delims = ",.\\|\n\r\\s\\(\\)\\[\\]{}!@#\$%\\^&\\*\\-_\\+=~`';:'\"\\?<>\\/";

    // un-wikify CamelCase, change "WikiName" to "Wiki Name"
    $value = preg_replace("/((?<=[a-z0-9]|[B-Z]{2})([A-Z][a-z]))/", " \\1", $value);
    // separate alphanumeric and local characters
    $value = preg_replace("/((?<=[a-z0-9])([^a-z0-9]+))/i", " \\1", $value);

    // Drop empty tokens: an empty string is not a word worth searching.
    $keys = preg_split("/[{$delims}]+/", $value, -1, PREG_SPLIT_NO_EMPTY);
    if ($keys === false) {
        $keys = array();
    }

    // must be longer than $more_specific_len.
    if ($more_specific_len > 0) {
        foreach ($keys as $i => $key) {
            if (strlen($key) <= $more_specific_len) {
                unset($keys[$i]);
            }
        }
    }

    sort($keys);
    $keys = array_unique($keys);

    $fp = fopen(TEXT_DICT, 'r');
    if (!is_resource($fp)) {
        // Header and title are already out; close the page before bailing.
        $formatter->send_footer('', $options);
        return '';
    }
    $fs = fstat($fp);
    $fz = $fs['size'];

    if ($_debug) {
        $options['timer']->Check("read");
    }

    foreach ($keys as $key) {
        list(, $min_seek, $max_seek, $scount) = _fuzzy_bsearch_file($fp, $key, 0, $fz / 3, 0, $fz);
        if ($_debug) {
            $options['timer']->Check("seek");
        }

        list($c, $buf, $last) = _file_match($fp, $key, $min_seek, $max_seek, $fz, 0, true, 'UTF-8');
        if ($_debug) {
            $options['timer']->Check("find");
            print 'found=' . $c . "<br />\n";
            print 'scount=' . $scount . "<br />\n";
            if ($last) {
                print 'last=' . $last . "<br />\n";
            }
            if ($_debug > 50) {
                if (!empty($buf)) {
                    print $buf . "<br />\n";
                }
            }
            $options['timer']->Check("log");
        }
    }
    fclose($fp);

    if ($_debug) {
        print "total " . sizeof($keys) . " words searched<br />\n";
        $options['timer']->Check("dict");
        print "<pre>";
        print $options['timer']->Write();
        print "</pre>";
    }

    $formatter->send_footer('', $options);
    return;
}