Beispiel #1
0
 /**
  * Look up a (possibly compound) word in the dictionary file.
  *
  * $word is split on whitespace. Only the last token is searched for in
  * $this->_dict; any preceding tokens are treated as prefixes and are
  * joined with '-' in front of every candidate that is returned.
  *
  * When _file_match() reports no hit but does report a "last" (similar)
  * entry, the key of that entry is taken as a prefix of the word, the
  * remainder is reduced with getNoun(), and the lookup recurses on
  * "<prefixes> <stem>".
  *
  * Dictionary lines are expected to look like "key:tag:rest" (they are
  * split on ':').
  *
  * @param string $word  Word to look up; whitespace separates prefixes from
  *                      the token that is actually searched.
  * @param bool   $flag  Passed through unchanged to _file_match().
  * @param float  $fuzzy Not used by this method.
  *
  * @return array array($c, $cand, $last) where $c and $last are the first
  *               and third values returned by _file_match() and $cand is a
  *               list of array(prefixed key, tag name, third field).
  */
 function isWord($word, $flag = false, $fuzzy = 0.7)
 {
     $encoding = 'UTF-8';

     // Simple caching: the key is every whitespace-separated token glued together.
     $words = preg_split('/\\s+/', $word);
     $cword = implode('', $words);
     if (array_key_exists($cword, $this->_cache)) {
         return $this->_cache[$cword];
     }

     // Only the last token is searched; what is left in $words are prefixes.
     $word = array_pop($words);
     list(, $min_seek, $max_seek) = _fuzzy_bsearch_file($this->_dict, $word, 0, $this->_dict_size / 2, 0, $this->_dict_size);
     list($c, $buf, $last) = _file_match($this->_dict, $word, $min_seek, $max_seek, $this->_dict_size, 0, $flag, $encoding);

     $cand = array();
     if (!empty($c)) {
         // Match(es) found: one candidate per line of the returned buffer.
         $pre = empty($words) ? '' : implode('-', $words) . '-';
         foreach (explode("\n", rtrim($buf)) as $line) {
             list($k, $t, $r) = explode(':', $line);
             $cand[] = array($pre . $k, $this->tagName($t), $r);
         }
     } elseif (!empty($last)) {
         // Similar match found: use its key as a prefix of the word.
         // XXX get shortest match word
         list($k) = explode(':', $buf);
         $pre = $k;

         // A one-character prefix is widened to the first two characters of
         // the word (e.g. 5 => 2 + 3 rather than 1 + 4). mb_strlen() counts
         // characters, replacing the deprecated strlen(utf8_decode()).
         if (mb_strlen($pre, $encoding) == 1 && strlen($word) > 1) {
             $pre = mb_substr($word, 0, 2, $encoding);
         }

         // Byte-wise on purpose: strips exactly the bytes of $pre.
         $nword = substr($word, strlen($pre));

         // Next word: the prefix joins the prefix list.
         $words[] = $pre;
         $pre = implode('-', $words);

         if (!empty($nword)) {
             // Noun and non-noun tags were handled identically, so there is
             // a single path here.
             $stem = $this->getNoun($nword, $match);
             if (!empty($stem)) {
                 // NOTE(review): $flag is not forwarded to the recursive call,
                 // and this result is cached only under the inner call's key.
                 return $this->isWord($pre . ' ' . $stem);
             }
         }
     }

     $ret = array($c, $cand, $last);
     // Store under the same key the lookup above uses. Writing under the
     // last token alone made multi-token results shadow plain lookups.
     $this->_cache[$cword] = $ret;
     return $ret;
 }
Beispiel #2
0
/**
 * Tokenise a page (or a supplied string) and look every distinct token up
 * in the TEXT_DICT dictionary file.
 *
 * The lookup results are only reported when debugging is enabled; without
 * it the function emits the header, title and footer and nothing else.
 *
 * @param object $formatter Must provide send_header(), send_title(),
 *                          send_footer() and page->get_raw_body().
 * @param array  $options   Keys read here:
 *                          - 'value': text to tokenise instead of the page body.
 *                          - 'debug': non-zero enables diagnostics; above 50
 *                            also prints the matched buffer.
 *                          - 'timer': object with Check() and Write(), used
 *                            only when debugging.
 *                          - 'more_specific_len': optional; tokens whose byte
 *                            length is <= this value are skipped (default 0,
 *                            no filtering).
 *
 * @return string|null '' when TEXT_DICT cannot be opened, otherwise nothing.
 */
function do_textdict($formatter, $options)
{
    $_debug = !empty($options['debug']) ? $options['debug'] : 0;

    // The threshold used to be read from a variable that was never defined,
    // so the filter below could not run. Default 0 keeps that behaviour.
    // NOTE(review): the option key name is an assumption - confirm where the
    // setting is meant to come from.
    $more_specific_len = isset($options['more_specific_len']) ? (int) $options['more_specific_len'] : 0;

    $formatter->send_header('', $options);
    $formatter->send_title('', '', $options);

    if (!empty($options['value'])) {
        $value = $options['value'];
    } else {
        $value = $formatter->page->get_raw_body($options);
    }

    $delims = ",.\\|\n\r\\s\\(\\)\\[\\]{}!@#\$%\\^&\\*\\-_\\+=~`';:'\"\\?<>\\/";
    # un-wikify CamelCase, change "WikiName" to "Wiki Name"
    $value = preg_replace("/((?<=[a-z0-9]|[B-Z]{2})([A-Z][a-z]))/", " \\1", $value);
    # separate alphanumeric and local characters
    $value = preg_replace("/((?<=[a-z0-9])([^a-z0-9]+))/i", " \\1", $value);
    # empty tokens (leading/trailing delimiters) are not words; drop them
    $keys = preg_split("/[{$delims}]+/", $value, -1, PREG_SPLIT_NO_EMPTY);

    # must be longer than $more_specific_len (byte length).
    if ($more_specific_len > 0) {
        foreach ($keys as $i => $key) {
            if (strlen($key) <= $more_specific_len) {
                unset($keys[$i]);
            }
        }
    }
    sort($keys);
    $keys = array_unique($keys);

    $fp = fopen(TEXT_DICT, 'r');
    if (!is_resource($fp)) {
        // NOTE(review): header and title were already sent at this point but
        // no footer follows.
        return '';
    }
    $fs = fstat($fp);
    $fz = $fs['size'];
    if ($_debug) {
        $options['timer']->Check("read");
    }

    foreach ($keys as $key) {
        // Narrow the file range first, then match inside that range.
        list(, $min_seek, $max_seek, $scount) = _fuzzy_bsearch_file($fp, $key, 0, $fz / 3, 0, $fz);
        if ($_debug) {
            $options['timer']->Check("seek");
        }
        list($c, $buf, $last) = _file_match($fp, $key, $min_seek, $max_seek, $fz, 0, true, 'UTF-8');
        if ($_debug) {
            $options['timer']->Check("find");
            print 'found=' . $c . "<br />\n";
            print 'scount=' . $scount . "<br />\n";
            if ($last) {
                print 'last=' . $last . "<br />\n";
            }
            if ($_debug > 50 && !empty($buf)) {
                print $buf . "<br />\n";
            }
            $options['timer']->Check("log");
        }
    }
    fclose($fp);

    if ($_debug) {
        print "total " . sizeof($keys) . " words searched<br />\n";
        $options['timer']->Check("dict");
        print "<pre>";
        print $options['timer']->Write();
        print "</pre>";
    }
    $formatter->send_footer('', $options);
    return;
}