/**
 * Takes a list of words and returns them reduced to their stems.
 *
 * $words can be either a string or an array. If it is a string, it will
 * be split into separate words on whitespace, commas, or semicolons. If
 * an array, it assumes one word per element.
 *
 * When $commonwords is strictly true, common words are removed before
 * stemming and runs of two or more whitespace characters are collapsed
 * into a single space. Array input is joined with spaces for that step
 * and split back into one word per element afterwards; empty elements
 * that would result from removed leading/trailing words are dropped.
 *
 * @param mixed $words
 *            String or array of word(s) to reduce
 * @param bool $commonwords
 *            Remove common words prior to stemming (only an exact
 *            boolean true enables this)
 * @access public
 * @return array List of word stems
 */
public function getStemmed($words, $commonwords = false)
{
    if ($commonwords === true && (is_array($words) || is_string($words))) {
        $wasArray = is_array($words);

        // Common-word removal operates on text, so arrays are joined first.
        $text = $wasArray ? implode(" ", $words) : $words;
        $text = \Rss\Text\CommonWords\CommonWords::removeCommonWords($text);
        $text = preg_replace('/\\s{2,}/', ' ', $text);

        if ($wasArray) {
            // Trim before splitting: a removed first or last word leaves a
            // stray space behind, which explode() would otherwise turn into
            // an empty-string "word". Fully emptied input yields no words.
            $text = trim((string) $text);
            $words = ($text === '') ? array() : explode(" ", $text);
        } else {
            $words = $text;
        }
    }

    $stemmer = new \Rss\Util\Contrib\Nlp\Stemmers\en\Stemmer\Stemmer();

    return $stemmer->stem_list($words);
}
/**
 * Cleans up a single item and validates its link.
 *
 * The title, description, content and link entries of $item are replaced
 * with the results of get_title(), get_text() and get_link(). The item is
 * rejected (a warning is logged and null is returned) when the resulting
 * link is empty, is not considered valid by the Zend URI object, or has no
 * host component according to parse_url().
 *
 * With a truthy $extinfo, three extra entries derived from the title are
 * added: 'titlenocommon', 'titlestemmed' and 'titlemetaphone'.
 *
 * @param array $item
 *            Item data; the keys 'title', 'description', 'content' and
 *            'link' are read
 * @param bool $extinfo
 *            Whether to add the extended title entries
 * @return array|null The processed item sorted by key, or null if the
 *         link was rejected
 */
private function process_item($item, $extinfo)
{
    $title = $this->get_title($item['title']);
    $item['title'] = $title;
    $item['description'] = $this->get_text($item['description']);
    $item['content'] = $this->get_text($item['content']);
    $link = $this->get_link($item['link']);
    $item['link'] = $link;

    if (empty($link)) {
        $this->logger->logWarn("Empty item url ({$link})");
        return null;
    }

    $uri = \Zend\Uri\UriFactory::factory($link);
    if (!$uri->isValid()) {
        $this->logger->logWarn("Invalid item url ({$link})");
        return null;
    }

    // empty() already covers the "key not set" case.
    $parts = parse_url($link);
    if (empty($parts['host'])) {
        $this->logger->logWarn("Invalid item host ({$link})");
        return null;
    }

    if (!$extinfo) {
        ksort($item);
        return $item;
    }

    // Extended title information.
    $withoutCommon = \Rss\Text\CommonWords\CommonWords::removeCommonWords($title);

    $porter = new \Rss\Text\Stemmer\PorterStemmer();
    // Collapse whitespace runs (including line breaks) into single spaces.
    $stemmed = preg_replace('/\\s+/m', ' ', $porter->getStemmed($withoutCommon));

    $helper = new \Rss\Text\Misc\Misc();
    $metaphone = $helper->getMetaphone($stemmed);

    $item['titlenocommon'] = $withoutCommon;
    $item['titlestemmed'] = $stemmed;
    $item['titlemetaphone'] = $metaphone;

    ksort($item);

    return $item;
}