Beispiel #1
0
/**
 * Split a text into index keys and reduce them with the Korean stemmer.
 *
 * The text is tokenised on punctuation/whitespace after CamelCase words and
 * alphanumeric/non-alphanumeric runs have been separated. Tokens of one byte
 * or less are dropped, the rest is sorted and de-duplicated, and every token
 * is passed to KoreanStemmer::getStem():
 *  - a stem reported with type 1 replaces the token,
 *  - a stem reported with any other type removes the token,
 *  - a token without a stem is kept unchanged.
 *
 * @param mixed  $formatter unused; kept for the filter calling convention
 * @param string $value     text to index (overridden by $options['value'])
 * @param array  $options   recognised keys, all optional:
 *                          'value' - non-empty text that replaces $value,
 *                          'debug' - when truthy, return the stemming log,
 *                          'timer' - object with Check()/Write(), used only
 *                                    in debug mode
 * @return string newline separated keys, or the debug log in debug mode
 */
function filter_indexer_ko($formatter, $value, &$options)
{
    // tokens must be longer than this many bytes to be indexed
    $more_specific_len = 1;
    $indexer = new KoreanStemmer();
    $debug = !empty($options['debug']);

    if (!empty($options['value'])) {
        $value = $options['value'];
    }

    $delims = ",.\\|\n\r\\s\\(\\)\\[\\]{}!@#\$%\\^&\\*\\-_\\+=~`';:'\"\\?<>\\/";
    # un-wikify CamelCase, change "WikiName" to "Wiki Name"
    $value = preg_replace("/((?<=[a-z0-9]|[B-Z]{2})([A-Z][a-z]))/", " \\1", $value);
    # separate alphanumeric and local characters
    $value = preg_replace("/((?<=[a-z0-9])([^a-z0-9]+))/i", " \\1", $value);
    $keys = preg_split("/[{$delims}]+/", $value);

    # must be longer than $more_specific_len.
    if ($more_specific_len > 0) {
        foreach ($keys as $i => $key) {
            if (strlen($key) <= $more_specific_len) {
                unset($keys[$i]);
            }
        }
    }
    sort($keys);
    $keys = array_unique($keys);

    $log = '';
    $tag = array('+', '-');
    $type = null;
    foreach ($keys as $i => $key) {
        $match = null;
        $stem = $indexer->getStem(trim($key), $match, $type);
        if ($stem) {
            if ($debug) {
                // getStem() is not required to fill $match/$type, so guard both lookups
                $mark = isset($tag[$type - 1]) ? $tag[$type - 1] : '';
                $rest = isset($match[1]) ? $match[1] : '';
                $log .= $key . '=>' . $stem . $mark . '/' . $rest . "\n";
            }
            if ($type == 1) {
                $keys[$i] = $stem;
            } else {
                unset($keys[$i]);
            }
        } elseif ($debug) {
            $log .= '=' . $key . "\n";
        }
    }

    if ($debug) {
        $timing = '';
        if (isset($options['timer']) and is_object($options['timer'])) {
            $options['timer']->Check("indexer");
            $timing = $options['timer']->Write();
        }
        return $log . "\n" . $timing;
    }
    // stemming may map different tokens onto the same stem
    return implode("\n", array_unique($keys));
}
Beispiel #2
0
/**
 * Keywords macro: render the keywords of a page as a weighted tag list,
 * as a Flash tag cloud, or return them as an array.
 *
 * Comma separated arguments accepted in $value:
 *  - delicious | del.icio.us | technorati | flickr : link tags to that site
 *  - all            : merge the keywords of every page, no limit
 *  - random         : like "all", pick one random keyword
 *  - random=N       : like "all", pick N random keywords (N >= 1)
 *  - suggest        : generate keyword suggestions from the page text
 *  - tour           : link tags to the "tour" action
 *  - cloud          : emit the Flash tag cloud instead of the list
 *  - freq | sort=freq | sort=alpha : ordering of the result
 *  - limit=N        : keep at most N keywords (0 = unlimited)
 *  - type=full|title: search action used for the tag links
 *  - url=URL        : custom tag link; '$TAG' is appended when missing
 *  - anything else  : taken as the page name (default: the current page)
 *
 * Keys read from $options: 'limit', 'all', 'random', 'suggest', 'tour',
 * 'cloud', 'sort', 'merge', 'call' and 'add'.
 *
 * @param object $formatter page formatter; page->name, pi, link_url() and
 *                          register_javascripts() are used
 * @param string $value     macro argument string, see above
 * @param array  $options   extra options, see above
 * @return string|array HTML markup; the word => weight map when
 *                      $options['call'] is set; '' when suggestions are
 *                      requested for a page that does not exist
 */
function macro_Keywords($formatter, $value, $options = array())
{
    global $DBInfo;

    $supported_lang = array('ko');
    $limit = isset($options['limit']) ? $options['limit'] : 40;
    $sort = '';
    $use_sty = 0;     // becomes 1 once the weights are real hit counts
    $fact = array();  // descending weight thresholds, one per font size
    $ok = 0;          // becomes 1 when at least one keyword is pre-checked

    # parse the macro arguments
    foreach (explode(',', $value) as $opt) {
        $opt = trim($opt);
        if ($opt === 'delicious' or $opt === 'del.icio.us') {
            $tag_link = 'http://del.icio.us/tag/$TAG';
        } elseif ($opt === 'technorati') {
            $tag_link = 'http://www.technorati.com/tag/$TAG';
        } elseif ($opt === 'flickr') {
            $tag_link = 'http://www.flickr.com/photos/tags/$TAG';
        } elseif ($opt === 'all') {
            $options['all'] = 1;
            $limit = 0;
        } elseif ($opt === 'random') {
            $options['random'] = $options['all'] = 1;
        } elseif ($opt === 'suggest') {
            $options['suggest'] = 1;
        } elseif ($opt === 'tour') {
            $options['tour'] = 1;
        } elseif ($opt === 'cloud') {
            $options['cloud'] = 1;
        } elseif ($opt === 'freq') {
            $sort = 'freq';
        } elseif (($p = strpos($opt, '=')) !== false) {
            $k = substr($opt, 0, $p);
            $v = substr($opt, $p + 1);
            if ($k === 'limit') {
                // a non-numeric or negative limit means "no limit"
                $limit = max(0, (int) $v);
            } elseif ($k === 'random') {
                $options['all'] = 1;
                $options['random'] = max(1, (int) $v);
            } elseif ($k === 'sort' and in_array($v, array('freq', 'alpha'), true)) {
                $sort = $v;
            } elseif ($k === 'type' and in_array($v, array('full', 'title'), true)) {
                $search = $v . 'search';
            } elseif ($k === 'url') {
                $tag_link = $v;
                // preg_match() returns 0 (not false) on "no match", so test
                // for the placeholder with strpos() instead
                if (strpos($tag_link, '$TAG') === false) {
                    $tag_link .= '$TAG';
                }
            }
            // any other key=value pair is ignored
        } else {
            $pagename = $opt;
        }
    }
    if (isset($options['random']) and empty($limit)) {
        $limit = 0;
    }
    if (isset($options['sort']) and $options['sort'] == 'freq') {
        $sort = 'freq';
    }
    if (empty($pagename)) {
        $pagename = $formatter->page->name;
    }

    # get cached keywords
    $cache = new Cache_text('keyword');
    $mc = new Cache_text('macro');
    $mkey = 'Keywords.' . md5($pagename . $value);
    $mykeys = array();
    # check cache mtime
    $cmt = $mc->mtime($mkey);
    $pmt = $cache->mtime($pagename);
    if ($cmt > $pmt) {
        # check update or not
        $dmt = $cache->mtime();
        if ($dmt > $cmt) {
            # XXX crude method
            $mykeys = array();
        } else {
            $mykeys = $mc->fetch($mkey);
            if (!is_array($mykeys)) {
                // a failed fetch must not reach array_merge() below
                $mykeys = array();
            }
        }
    } else {
        $mc->remove($mkey);
    }
    if (!$mykeys) {
        if (!empty($options['all'])) {
            $pages = $DBInfo->getPageLists();
        } else {
            $pages = array($pagename);
        }
        foreach ($pages as $pn) {
            $keys = $cache->fetch($pn);
            if ($keys and is_array($keys)) {
                $mykeys = array_merge($mykeys, $keys);
            }
        }
        $mc->update($mkey, $mykeys);
    }

    if (!empty($options['all'])) {
        $use_sty = 1;
        $words = array_count_values($mykeys);
        unset($words['']);
        arsort($words);
        $max = current($words);
        // get max hit number
        if (!empty($options['random']) and $words) {
            $howmany = max(1, min((int) $options['random'], count($words)));
            // array_rand() returns a bare key when asked for one entry
            $selected = (array) array_rand($words, $howmany);
            $rws = array();
            foreach ($selected as $k) {
                $rws[$k] = $words[$k];
            }
            $words = $rws;
        }
        if ($sort != 'freq') {
            ksort($words);
        }
    } else {
        $max = 3;
        // default weight
        $words = array();
        foreach ($mykeys as $key) {
            $words[$key] = $max;
            // give weight to all selected keywords
        }
    }

    # automatically generate list of keywords
    if (empty($options['all']) and (empty($words) or isset($options['suggest']))) {
        $common = <<<EOF
am an a b c d e f g h i j k l m n o p q r s t u v w x y z
0 1 2 3 4 5 6 7 8 9
if on in by it at up as down over into for from to of he his him she her back
is are be being been or no not nor and all through under until
these there the top
with here only has had both did faw few little most almost much off on out
also each were was too any very more within then
across before behind beneath beyond after again against around among
so such since because but yet however ever during
it its the this that what where how when who whoever which their them
you your will shall may might we us our
get got would could have
can't won't didn't don't
aiff arj arts asp au avi bin biz css cgi com doc edu exe firm gif gz gzip
htm html info jpeg jpg js jsp mp3 mpeg mpg mov
nom pdf php pl qt ra ram rec shop sit tar tgz tiff txt wav web zip
one two three four five six seven eight nine ten eleven twelve
ftp http https www web net org or kr co us de
EOF;
        $page = $DBInfo->getPage($pagename);
        if (!$page->exists()) {
            return '';
        }
        $raw = $page->get_raw_body();
        $raw = rtrim($raw);
        // strip macros, entities
        $raw = preg_replace("/&[^;\\s]+;|\\[\\[[^\\[]+\\]\\]/", ' ', $raw);
        // strip "##" comment lines
        $raw = preg_replace("/^##.*\$/m", ' ', $raw);
        // strip tags and punctuation; the page name takes part as well
        $raw = preg_replace("/([;\"',`\\\\\\/\\.:@#\\!\\?\$%\\^&\\*\\(\\)\\{\\}\\[\\]\\-_\\+=\\|<>])/", ' ', strip_tags($raw . ' ' . $pagename));
        // split CamelCase words
        $raw = preg_replace("/((?<=[a-z0-9]|[B-Z]{2})([A-Z][a-z]))/", " \\1", $raw);
        $raw = strtolower($raw);
        $raw = preg_replace("/\\b/", ' ', $raw);
        $words = preg_split("/\\s+|\n/", $raw);

        // remove common words
        $common_word_page0 = LOCAL_KEYWORDS . '/CommonWords';
        $lines0 = array();
        if ($DBInfo->hasPage($common_word_page0)) {
            $p = $DBInfo->getPage($common_word_page0);
            $lines0 = explode("\n", $p->get_raw_body());
        }
        $lang = isset($formatter->pi['#language']) ? $formatter->pi['#language'] : $DBInfo->default_language;
        // NOTE(review): $lines0 is only used when a language specific page
        // exists; confirm whether the generic page should always apply.
        if ($lang and in_array($lang, $supported_lang)) {
            $common_word_page = LOCAL_KEYWORDS . '/CommonWords' . ucfirst($lang);
            if ($DBInfo->hasPage($common_word_page)) {
                $p = $DBInfo->getPage($common_word_page);
                $lines = explode("\n", $p->get_raw_body());
                $lines = array_merge($lines, $lines0);
                foreach ($lines as $line) {
                    if (isset($line[0]) and $line[0] == '#') {
                        continue;
                    }
                    $common .= "\n" . $line;
                }
                $common = rtrim($common);
            }
        }
        $words = array_diff($words, preg_split("/\\s+|\n/", $common));

        if (!empty($DBInfo->use_stemmer)) {
            include_once dirname(__FILE__) . '/../lib/stemmer.ko.php';
            include_once dirname(__FILE__) . '/../lib/stemmer.php';
            $indexer = new KoreanStemmer();
            // without an open dictionary the words are left unstemmed
            if (is_resource($indexer->_dict)) {
                $founds = array();
                foreach ($words as $key) {
                    if (preg_match('/^[a-zA-Z0-9]+$/', $key)) {
                        // alphanumeric words go through the Porter stemmer
                        $founds[] = PorterStemmer::Stem($key);
                        continue;
                    }
                    $match = null;
                    $stem = $indexer->getStem(trim($key), $match, $type);
                    if (!empty($stem)) {
                        $founds[] = $stem;
                    }
                }
                $words = $founds;
                $indexer->close();
            }
        }

        // count adjacent word pairs (both longer than two bytes), order-insensitive
        $preword = '';
        $bigwords = array();
        foreach ($words as $word) {
            if (strlen($word) > 2 and strlen($preword) > 2) {
                if ($word == $preword) {
                    continue;
                }
                $key = $preword . ' ' . $word;
                $rkey = $word . ' ' . $preword;
                if (isset($bigwords[$key])) {
                    $bigwords[$key]++;
                } elseif (isset($bigwords[$rkey])) {
                    $bigwords[$rkey]++;
                } else {
                    $bigwords[$key] = 1;
                }
            }
            $preword = $word;
        }
        $words = array_count_values($words);
        unset($words['']);

        // keep only the pairs seen more than once
        // (plain loop instead of create_function(), which PHP 8 removed)
        foreach ($bigwords as $k => $v) {
            if ($v != 1) {
                $words[$k] = $v;
            }
        }
        arsort($words);
        $max = current($words);
        // get max hit number
        if (isset($options['merge'])) {
            foreach ($mykeys as $key) {
                $words[$key] = $max;
                // give weight to all selected keywords
            }
        }
        $use_sty = 1;
    }

    if (!empty($options['call'])) {
        return $words;
    }

    if ($limit and sizeof($words) > $limit) {
        // keep the $limit heaviest keywords
        arsort($words);
        $words = array_slice($words, 0, $limit, true);
    }

    // make criteria list
    $fz = 0;
    $min = 0;
    $sty = array();
    if (!empty($use_sty)) {
        $weight = $max;
        $test = array(0.8, 0.6, 0.4, 0.5, 0.5, 0.5);
        // six level
        for ($i = 0; $i < 6 and $weight > 0; $i++) {
            $weight = (int) ($weight * $test[$i]);
            if ($weight > 0) {
                $fact[] = $weight;
            }
        }
        $max = current($fact);
        $min = $limit ? max(1, end($fact)) - 1 : 0;
        // XXX
        // make font-size style
        $fz = max(sizeof($fact), 2);
        $fsh = (MAX_FONT_SZ - MIN_FONT_SZ) / ($fz - 1);
        $fs = MAX_FONT_SZ;
        for ($i = 0; $i < $fz; $i++) {
            $ifs = (int) ($fs + 0.5);
            $sty[] = " style='font-size:{$ifs}px;'";
            $fs -= $fsh;
            $fs = max($fs, 9);
            // min font-size:9px
        }
    }
    if (empty($sort) or $sort != 'freq') {
        ksort($words);
    }

    $link = $formatter->link_url(_rawurlencode($pagename), '');
    if (!isset($tag_link)) {
        if (empty($search)) {
            $search = 'fullsearch&amp;keywords=1';
        }
        if (!empty($options['tour'])) {
            $search = 'tour&amp;arena=keylinks';
        }
        $tag_link = $formatter->link_url(_rawurlencode($pagename), '?action=' . $search . '&amp;value=$TAG');
    }

    // NOTE(review): $key is interpolated into HTML and URLs unescaped in
    // both branches below; confirm keywords are sanitised where they are stored.
    if (isset($options['cloud'])) {
        $out = '';
        foreach ($words as $key => $val) {
            $style = $fz > 0 ? $sty[$fz - 1] : '';
            for ($i = 0; $i < $fz; $i++) {
                // $fact may hold fewer than $fz thresholds; a missing one counts as 0
                $threshold = isset($fact[$i]) ? $fact[$i] : 0;
                if ($val > $threshold) {
                    $style = $sty[$i];
                    break;
                }
            }
            if ($val > $min) {
                $out .= "<a href='" . qualifiedUrl(str_replace('$TAG', $key, $tag_link)) . "'";
                if ($use_sty) {
                    $out .= ' ' . $style;
                } else {
                    $out .= " style='12'";
                }
                $out .= ">" . $key . "</a>";
            }
        }
        $out = str_replace('&amp;', urlencode('&'), $out);
        $formatter->register_javascripts(array('js/swfobject.js'));
        $_swf_prefix = qualifiedUrl("{$DBInfo->url_prefix}/local/wp-cumulus");
        // FIXME
        return <<<SWF
<script type="text/javascript">
var flashvars = {
   mode : "tags",
   distr : "true",
   tcolor : "0xffffff",
   tcolor2 : "0x86B9F2",
   hicolor : "0xBAD8F8",
   tagcloud : "<tags>{$out}</tags>"
};

var params = {
   wmode: "opaque",
   bgcolor: "#333333"
};

var attrs = {
   id: "myCloudContent"
};

swfobject.embedSWF("{$_swf_prefix}/tagcloud.swf", "myCloud", "200", "200", "9.0.0","expressInstall.swf", flashvars, params, attrs);
</script>
<div id="myCloud">
</div>
SWF;
    }

    $out = '';
    if (!empty($options['add'])) {
        $out = "<form method='post' action='{$link}'>\n";
        $out .= "<input type='hidden' name='action' value='keywords' />\n";
    }
    $out .= '<ul>';
    $checkbox = '';
    foreach ($words as $key => $val) {
        $style = '';
        if ($fz > 0) {
            $style = $sty[$fz - 1];
            for ($i = 0; $i < $fz; $i++) {
                // $fact may hold fewer than $fz thresholds; a missing one counts as 0
                $threshold = isset($fact[$i]) ? $fact[$i] : 0;
                if ($val > $threshold) {
                    $style = $sty[$i];
                    break;
                }
            }
        }
        if ($val > $min) {
            $checked = '';
            if ($val >= $max) {
                $checked = 'checked="checked"';
                $ok = 1;
            }
            if (!empty($options['add'])) {
                $checkbox = "<input type='checkbox' {$checked} name='key[]' " . "value='{$key}' />";
            }
            $out .= " <li class=\"tag-item\"";
            if (!empty($use_sty)) {
                $out .= " {$style} title=\"{$val} " . _("hits") . '"';
            }
            $out .= ">{$checkbox}" . "<a href='" . str_replace('$TAG', $key, $tag_link) . "' rel='nofollow'>" . $key . "</a></li>\n";
        }
    }

    $inp = '';
    $form_close = '';
    if (!empty($options['add'])) {
        $msg = _("add keywords");
        $inp = "<li><input type='text' name='keywords' size='12' />: {$msg}</li>";
        if ($ok) {
            $btn = _("Update keywords");
        } else {
            $btn = _("Add keywords");
        }
        $btn1 = _("Add as common words");
        $btn2 = _("Unselect all");
        $btnc = _("Suggest new Keywords");
        $form_close = "<input type='submit' value='{$btn}'/>\n";
        $form_close .= "<input type='submit' name='suggest' value='{$btnc}' />\n";
        $form_close .= "<input type='submit' name='common' value='{$btn1}' />\n";
        $form_close .= "<input type='button' value='{$btn2}' onClick='UncheckAll(this)' />\n";
        $form_close .= "<select name='lang'><option>---</option>\n";
        foreach ($supported_lang as $l) {
            $form_close .= "<option value='{$l}'>{$l}</option>\n";
        }
        $langmsg = _("select language");
        $form_close .= "</select>: {$langmsg}\n</form>\n";
        $form_close .= <<<EOF
<script type='text/javascript' src='{$DBInfo->url_prefix}/local/checkbox.js'>
</script>
EOF;
    }
    return "<div class='cloudView'>" . $out . "{$inp}</ul></div>{$form_close}";
}
Beispiel #3
0
 /**
  * Turn a list of words into the list of terms to index.
  *
  * When $this->use_stemming is greater than 1, words containing a character
  * other than 0-9/A-Z/a-z are replaced by their KoreanStemmer stem (or
  * dropped when no stem is found); purely alphanumeric words are kept.
  *
  * Otherwise words that pass the checks below are handed to _chunkWords();
  * when that call returns a truthy value the word is removed, and whatever
  * _chunkWords() collected is merged back in (presumably it fills its second
  * argument by reference -- verify against its definition).
  *
  * @param array $words words to process
  * @return array processed words
  */
 function _stemmingWords($words)
 {
     // one stemmer instance is shared by all calls
     static $indexer = null;

     if ($this->use_stemming > 1) {
         include_once dirname(__FILE__) . '/stemmer.ko.php';
         if (empty($indexer)) {
             $indexer = new KoreanStemmer();
         }

         $stemmed = array();
         foreach ($words as $term) {
             if (!preg_match('/[^0-9A-Za-z]/u', $term)) {
                 // plain alphanumeric words are kept untouched
                 $stemmed[] = $term;
                 continue;
             }
             $match = null;
             $stem = $indexer->getStem(trim($term), $match, $kind);
             if (!empty($stem)) {
                 $stemmed[] = $stem;
             }
         }
         return $stemmed;
     }

     $chunks = array();
     foreach ($words as $idx => $term) {
         // NOTE(review): for string input $term[0] is one byte, so the
         // comparison with "" looks like it can never hold -- confirm the
         // intended value against the upstream source.
         if (isset($term[0])
             && $term[0] == ""
             && preg_match('/[^0-9A-Za-z]/u', $term)
             && $this->_chunkWords($term, $chunks, true)
         ) {
             unset($words[$idx]);
         }
     }

     return array_unique(array_merge($words, $chunks));
 }