Exemplo n.º 1
0
 /**
  * Parse a string and turn it into an array of valid tokens.
  *
  * The string is lower-cased and split on non-word characters. A raw token
  * is kept only when it is 3 to 64 characters long, is not made up solely
  * of digits and, when $use_stopwords is true, is not a known stopword.
  *
  * @param string $string        Text to tokenize
  * @param bool   $use_stopwords When true, drop tokens found in the stopword
  *                              lists (loaded once and cached in self::$stopwords)
  * @param bool   $count         When true, return a map of token => number of
  *                              occurrences; when false, return a plain list of
  *                              tokens in order of appearance (duplicates kept)
  * @return array
  */
 public static function parseTokens($string, $use_stopwords = true, $count = false)
 {
     // \w means alphanumeric characters.
     // Usually, non-English letters and numbers are included.
     // \W is the negated version of \w
     //
     // TODO: We're splitting on "anything that isn't a word" which is good
     // for languages with punctuation and spaces. But what about Chinese,
     // Japanese, and other languages that don't use them? How do we
     // identify tokens in those cases?
     $string = mb_strtolower($string);
     $rawtokens = mb_split("\\W", $string);
     // mb_split() returns false on failure; treat that like "no tokens"
     // instead of passing a non-array to count()/foreach.
     if ($rawtokens === false || !count($rawtokens)) {
         return array();
     }

     if ($use_stopwords && !is_array(self::$stopwords)) {
         // Lazily build the stopword lookup table on first use
         $stopwords = array();
         $dir = dirname(__FILE__) . '/symbionts/stopwords';
         foreach (new DirectoryIterator($dir) as $file) {
             $filename = $file->getFilename();
             // Only two-letter language files such as "en.txt" are loaded
             if (!$file->isFile() || !preg_match('/^[a-z]{2}\\.txt$/', $filename)) {
                 continue;
             }
             $lines = file("{$dir}/{$filename}", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
             // file() returns false when the file cannot be read; skip it
             // rather than corrupting the list with a non-array merge.
             if ($lines !== false) {
                 $stopwords = array_merge($stopwords, $lines);
             }
         }
         // Add generic internet cruft for good measure
         $stopwords = array_merge($stopwords, array('http', 'https', 'mailto', 'www', 'com', 'net', 'org', 'biz', 'info'));
         // array_flip removes duplicates and increases speed by allowing isset() instead of in_array()
         self::$stopwords = array_flip($stopwords);
     }

     // IMPORTANT:
     // If you change the number 64 below, you need to adjust
     // suxNaiveBayesian() and the corresponding token DB column accordingly
     $tokens = array();
     foreach ($rawtokens as $val) {
         // The length check also rejects the empty strings that mb_split()
         // produces between consecutive separators.
         $length = mb_strlen($val);
         if ($length < 3 || $length > 64 || ctype_digit($val)) {
             continue;
         }
         // Honor $use_stopwords even when the static cache was already
         // populated by an earlier call that did want stopword filtering.
         if ($use_stopwords && isset(self::$stopwords[$val])) {
             continue;
         }
         if ($count) {
             if (isset($tokens[$val])) {
                 $tokens[$val]++;
             } else {
                 $tokens[$val] = 1;
             }
         } else {
             $tokens[] = $val;
         }
     }
     return $tokens;
 }