/**
  * Returns the top ranked words.
  *
  * Words are taken in order of descending weight. Numeric words and words with
  * fewer than three ASCII alphanumeric characters are skipped. When several
  * words share the same lemma, only the highest ranked form is returned.
  *
  * @param int $amount Maximum number of keywords to return
  * @return array List of keywords, highest weight first
  */
 public function getKeywords($amount = 10)
 {
     // Sort the words by weight, heaviest first
     $weights = $this->wordWeight;
     arsort($weights);

     // With a plain non-negative integer limit we can stop as soon as enough
     // keywords were collected. Any other value is left to array_slice() below,
     // so its handling of negative or non-integer lengths stays untouched.
     $canStopEarly = is_int($amount) && $amount >= 0;

     $keywords = [];
     $seenLemmas = [];
     foreach (array_keys($weights) as $word) {
         if ($canStopEarly && count($keywords) >= $amount) {
             break;
         }

         // PHP stores numeric string keys as integers; work on the string form
         $word = (string) $word;
         if (is_numeric($word)) {
             continue;
         }

         // NOTE(review): only ASCII letters and digits count towards the minimum
         // length, so words written in other scripts never qualify. Confirm that
         // this is intended for the datasets in use.
         $stripped = (string) preg_replace("/[^a-zA-Z0-9]+/", "", $word);
         if (strlen($stripped) < 3) {
             continue;
         }

         // Resolve the lemma only for words that passed the filter, instead of
         // querying the adapter for the whole vocabulary up front
         $lemma = $this->datasetAdapter->getLemmaFromForm($word);
         if ($lemma === null || $lemma === false || $lemma === '') {
             // Presumably the adapter has no lemma for this form (verify against
             // the adapter). Fall back to the word itself so that unknown words
             // are not all treated as duplicates of each other.
             $lemma = $word;
         }

         // Skip words whose lemma is already represented by a better ranked form
         if (isset($seenLemmas[$lemma])) {
             continue;
         }
         $seenLemmas[$lemma] = true;
         $keywords[] = $word;
     }

     return array_slice($keywords, 0, $amount);
 }
示例#2
0
 /**
  * Returns IDF for the given word form.
  *
  * The value is log(examined document count / word frequency). A word with no
  * recorded frequency is treated as having a frequency of 1 instead of causing
  * a division by zero. If no documents have been examined, 0.0 is returned.
  *
  * @param $wordForm
  * @return float
  */
 public function getInverseDocumentFrequency($wordForm)
 {
     $frequency = $this->datasetAdapter->getWordFrequency($wordForm);
     $documentCount = $this->datasetAdapter->getExaminedDocumentCount();

     if (!($documentCount > 0)) {
         // Without any examined document the ratio is meaningless and log(0)
         // would yield -INF
         return 0.0;
     }

     if (!($frequency > 0)) {
         // Missing or zero frequency: avoid dividing by zero.
         // NOTE(review): assumes frequencies are counts, so 1 is the smallest
         // meaningful value - confirm against the adapter.
         $frequency = 1;
     }

     return log($documentCount / $frequency);
 }