예제 #1
0
 /**
  * Reduces a list of words to their stems.
  *
  * $words may be a string or an array. An array is treated as one word per
  * element. A string is handed to the stemmer unchanged in shape; splitting
  * it into words (on whitespace, commas, or semicolons, per this method's
  * documented contract) is presumably done by stem_list() - it does not
  * happen in this method.
  *
  * When $commonwords is exactly true, common words are stripped first:
  *  - array input is joined with spaces, filtered, and split back into
  *    words; blank entries left behind by removed words are discarded so
  *    the stemmer never receives an empty "word";
  *  - string input is filtered, has runs of whitespace collapsed to a
  *    single space, and is trimmed.
  *
  * @param  mixed $words
  *                            String or array of word(s) to reduce
  * @param  bool  $commonwords
  *                            Remove common words prior to stemming
  *                            (only the boolean true enables this)
  * @access public
  * @return array List of word stems
  */
 public function getStemmed($words, $commonwords = false)
 {
     if ($commonwords === true && (is_array($words) || is_string($words))) {
         $wasArray = is_array($words);
         $text = $wasArray ? implode(' ', $words) : $words;
         $text = \Rss\Text\CommonWords\CommonWords::removeCommonWords($text);

         if ($wasArray) {
             // Removing a leading/trailing word (or every word) leaves stray
             // whitespace behind; a plain explode(' ') would turn that into
             // empty-string elements. Split on whitespace and drop empties.
             $words = preg_split('/\\s+/', (string) $text, -1, PREG_SPLIT_NO_EMPTY);
         } else {
             // Collapse the gaps left by removed words and strip the edges.
             $words = trim((string) preg_replace('/\\s{2,}/', ' ', $text));
         }
     }

     $stemmer = new \Rss\Util\Contrib\Nlp\Stemmers\en\Stemmer\Stemmer();

     return $stemmer->stem_list($words);
 }
예제 #2
0
 /**
  * Cleans up a single item and optionally adds derived title fields.
  *
  * The 'title', 'description', 'content' and 'link' entries are passed
  * through get_title(), get_text() and get_link() respectively and written
  * back into the item. The item is rejected - a warning is logged and null
  * is returned - when the resulting link is empty, fails Zend URI
  * validation, or has no host component.
  *
  * When $extinfo is truthy three extra keys are added, all derived from the
  * cleaned title: 'titlenocommon' (common words removed), 'titlestemmed'
  * (stemmed, whitespace runs replaced by a single space) and
  * 'titlemetaphone' (result of Misc::getMetaphone() on the stemmed title).
  *
  * @param array $item    Item data; the keys 'title', 'description',
  *                       'content' and 'link' are read
  * @param bool  $extinfo Whether to add the derived title fields
  * @return array|null    The processed item sorted by key, or null when the
  *                       link is rejected
  */
 private function process_item($item, $extinfo)
 {
     // Normalise the raw fields in place, in a fixed order.
     $item['title'] = $this->get_title($item['title']);
     foreach (array('description', 'content') as $textField) {
         $item[$textField] = $this->get_text($item[$textField]);
     }
     $item['link'] = $this->get_link($item['link']);

     // Link validation: each failed check logs a warning and rejects the item.
     $link = $item['link'];
     if (empty($link)) {
         $this->logger->logWarn("Empty item url ({$link})");
         return null;
     }
     if (!\Zend\Uri\UriFactory::factory($link)->isValid()) {
         $this->logger->logWarn("Invalid item url ({$link})");
         return null;
     }
     // parse_url() yields null/false here when the host is missing or the
     // URL is malformed; both count as "no host".
     $host = parse_url($link, PHP_URL_HOST);
     if (empty($host)) {
         $this->logger->logWarn("Invalid item host ({$link})");
         return null;
     }

     if ($extinfo) {
         // Derived title variants: common words removed -> stemmed -> metaphone.
         $withoutCommon = \Rss\Text\CommonWords\CommonWords::removeCommonWords($item['title']);
         $porter = new \Rss\Text\Stemmer\PorterStemmer();
         // Replace every whitespace run (line breaks included) with one space.
         $stemmedTitle = preg_replace('/\\s+/m', ' ', $porter->getStemmed($withoutCommon));
         $textTools = new \Rss\Text\Misc\Misc();
         $metaphone = $textTools->getMetaphone($stemmedTitle);

         $item['titlenocommon'] = $withoutCommon;
         $item['titlestemmed'] = $stemmedTitle;
         $item['titlemetaphone'] = $metaphone;
     }

     ksort($item);

     return $item;
 }