/**
 * Generates a centroid with which every sentence is ranked using a
 * cosine-style similarity, and also generates a word cloud.
 *
 * Steps: the page is cleaned up and split into sentences; stop words are
 * removed when the tokenizer for $lang provides stopwordsRemover(); the
 * most frequent terms (at most self::MAX_DISTINCT_TERMS) are weighted by
 * term count in the document times log(sentences / sentences containing
 * the term); the self::CENTROID_COMPONENTS heaviest terms form the
 * centroid and the first self::WORD_CLOUD_LEN of them the word cloud;
 * finally each sentence is scored against the centroid and the best
 * scoring sentences are concatenated, in document order, as the summary.
 *
 * @param string $doc complete raw page to generate the summary from.
 * @param string $lang language of the page, used to pick the tokenizer
 *     (and hence the stop words and segmenter) for that language.
 *
 * @return array two element array of summary and word cloud. The summary
 *     is a string; the word cloud is an array of terms. If no terms can
 *     be extracted both elements are the empty string. If the formatted
 *     document is shorter than PageProcessor::$max_description_len or
 *     has a single sentence, the (truncated) formatted document is
 *     returned in place of a computed summary.
 */
static function getCentroidSummary($doc, $lang)
{
    $doc = self::pageProcessing($doc);
    /* Format the document to remove characters other than periods and
       alphanumerics.
     */
    $formatted_doc = self::formatDoc($doc);
    $stop_obj = PhraseParser::getTokenizer($lang);
    /* Splitting into sentences */
    $out_sentences = self::getSentences($doc);
    $n = count($out_sentences);
    $sentences = array();
    if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
        for ($i = 0; $i < $n; $i++) {
            $sentences[$i] = $stop_obj->stopwordsRemover(
                self::formatDoc($out_sentences[$i]));
        }
    } else {
        $sentences = $out_sentences;
    }
    /* Splitting into terms */
    $terms = array();
    foreach ($sentences as $sentence) {
        $terms = array_merge($terms,
            PhraseParser::segmentSegment($sentence, $lang));
    }
    $terms = array_filter($terms);
    $terms_counts = array_count_values($terms);
    arsort($terms_counts);
    /* preserve_keys must be true: array_count_values stores numeric terms
       such as "2020" under integer keys, and array_slice would otherwise
       renumber those keys, replacing the term by its position.
     */
    $terms_counts = array_slice($terms_counts, 0,
        self::MAX_DISTINCT_TERMS, true);
    /* Cast back to strings so numeric terms are searched for as text by
       strpos, substr_count and preg_quote below.
     */
    $terms = array_map('strval', array_keys($terms_counts));
    $t = count($terms);
    if ($t == 0) {
        return array("", "");
    }
    /* Nk: number of sentences each term occurs in */
    $nk = array_fill(0, $t, 0);
    for ($i = 0; $i < $n; $i++) {
        for ($j = 0; $j < $t; $j++) {
            if (strpos($sentences[$i], $terms[$j]) !== false) {
                $nk[$j]++;
            }
        }
    }
    /* Inverse document frequency of each term, a sentence playing the
       role of a document. Terms found in no sentence get weight 0.
     */
    $idf = array();
    for ($k = 0; $k < $t; $k++) {
        $idf[$k] = ($nk[$k] == 0) ? 0 : log($n / $nk[$k]);
    }
    /* Count TF over the whole document for finding the centroid.
       Word boundaries are not used for languages written without spaces.
     */
    $wc = array();
    $b = in_array($lang, array("zh-CN", "ja", "ko"), true) ? "" : "\\b";
    for ($j = 0; $j < $t; $j++) {
        /* Quote the term so regex metacharacters or the delimiter in it
           are matched literally rather than breaking the pattern.
         */
        $pattern = "/" . $b . preg_quote($terms[$j], "/") . $b . "/";
        //$matches included for backwards compatibility
        $nt = preg_match_all($pattern, $formatted_doc, $matches);
        if ($nt === false) {
            // a failed match counts as "term not seen"
            $nt = 0;
        }
        $wc[$j] = $nt * $idf[$j];
        if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
            $wc[$j] = 0;
        }
    }
    /* Calculate centroid: keep only the heaviest terms */
    arsort($wc);
    $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
    /* Initializing centroid weight array by 0; terms outside the
       centroid keep weight 0.
     */
    $wc = array_fill(0, $t, 0);
    /* Word cloud */
    $i = 0;
    $word_cloud = array();
    foreach ($centroid as $key => $value) {
        $wc[$key] = $value;
        if ($i < self::WORD_CLOUD_LEN) {
            $word_cloud[$i] = $terms[$key];
        }
        $i++;
    }
    if (strlen($formatted_doc) < PageProcessor::$max_description_len
        || $n == 1) {
        //if input short only use above to get a word cloud
        $formatted_doc = substr($formatted_doc, 0,
            PageProcessor::$max_description_len);
        return array($formatted_doc, $word_cloud);
    }
    /* The centroid's norm does not depend on the sentence, so compute it
       once rather than inside the per-sentence loop.
     */
    $c1 = 0;
    for ($k = 0; $k < $t; $k++) {
        $c1 += $wc[$k] * $wc[$k];
    }
    $c2 = sqrt($c1);
    /* Calculate similarity measure between centroid and each sentence */
    $sim = array();
    for ($i = 0; $i < $n; $i++) {
        $a = 0;
        $b1 = 0;
        for ($k = 0; $k < $t; $k++) {
            $tmp = substr_count($sentences[$i], $terms[$k]);
            if ($tmp > 0) {
                $wik = $idf[$k] * (1 + log($tmp));
                /* NOTE(review): the numerator carries an extra idf
                   factor compared to a plain cosine; kept as is so
                   sentence ranking is unchanged -- confirm intent.
                 */
                $a += $wik * $wc[$k] * $idf[$k];
                $b1 += $wik * $wik;
            }
        }
        $d = sqrt($b1) * $c2;
        $sim[$i] = ($d == 0) ? 0 : $a / $d;
    }
    arsort($sim);
    /* Getting how many sentences should be there in summary */
    $top = self::summarySentenceCount($out_sentences, $sim);
    /* NOTE(review): one fewer than $top sentences is kept; presumably
       summarySentenceCount also counts the sentence that crosses the
       length limit -- confirm against that method.
     */
    $sum_array = array_slice($sim, 0, $top - 1, true);
    // restore document order of the selected sentences
    ksort($sum_array);
    /* Printing Summary */
    $summary = '';
    foreach ($sum_array as $key => $value) {
        $summary .= "{$out_sentences[$key]}" . ". ";
    }
    /* Summary of text summarization */
    return array($summary, $word_cloud);
}