/**
 * Returns the highest weighted words, one per lemma.
 *
 * Words are ranked by descending weight. Purely numeric words and words with
 * fewer than three ASCII letters/digits are skipped, and when several forms
 * share a lemma only the best ranked form is kept.
 *
 * @param int $amount Maximum number of keywords to return
 * @return array Keywords, re-indexed from 0 in ranking order
 */
public function getKeywords($amount = 10)
{
    // Look up the lemma of every weighted word and mark each lemma as unused
    $lemmaByWord = [];
    $lemmaTaken = [];
    foreach ($this->wordWeight as $form => $unusedWeight) {
        $lemma = $this->datasetAdapter->getLemmaFromForm($form);
        $lemmaByWord[$form] = $lemma;
        $lemmaTaken[$lemma] = false;
    }

    // Rank the words, heaviest first (arsort keeps the word => weight association)
    $ranked = $this->wordWeight;
    arsort($ranked);

    // Keep only non-numeric words with at least three ASCII alphanumeric characters
    $candidates = array_filter(array_keys($ranked), static function ($candidate) {
        $alnumLength = strlen(preg_replace("/[^a-zA-Z0-9]+/", "", $candidate));

        return !is_numeric($candidate) && $alnumLength >= 3;
    });

    // Walk the ranking and keep the first form seen for each lemma
    $unique = [];
    foreach ($candidates as $candidate) {
        $lemma = $lemmaByWord[$candidate];
        if ($lemmaTaken[$lemma]) {
            continue;
        }

        $lemmaTaken[$lemma] = true;
        $unique[] = $candidate;
    }

    return array_slice($unique, 0, $amount);
}
/**
 * Returns the inverse document frequency (IDF) for the given word form.
 *
 * The value is the natural logarithm of the examined document count divided
 * by the frequency the dataset adapter reports for the word form.
 *
 * A word form with no positive frequency (0, null or a non-numeric value)
 * is treated as if its frequency were 1. This avoids a division by zero and
 * gives such words the highest finite IDF for the current document count.
 * NOTE(review): this assumes the adapter reports frequency as a count
 * (>= 1 for known words) - confirm against the dataset adapter.
 *
 * @param $wordForm Word form passed through to the dataset adapter
 * @return float
 */
public function getInverseDocumentFrequency($wordForm)
{
    $frequency = $this->datasetAdapter->getWordFrequency($wordForm);

    // Guard the divisor: an unseen word would otherwise divide by zero
    if (!is_numeric($frequency) || $frequency <= 0) {
        $frequency = 1;
    }

    return log($this->datasetAdapter->getExaminedDocumentCount() / $frequency);
}