Пример #1
0
 /**
  * Summarizes a document by ranking its sentences against a centroid and
  * also generates a word cloud.
  *
  * The most frequent terms of the document are weighted by their number of
  * occurrences in the document times the log of the inverse fraction of
  * sentences containing them. The highest weighted terms form the centroid.
  * Every sentence is then scored with a cosine-style similarity to that
  * centroid and the best scoring sentences, restored to document order,
  * make up the summary.
  *
  * @param string $doc complete raw page to generate the summary from.
  * @param string $lang language of the page; used to look up the tokenizer
  *     for that language (and through it the stop word remover, if the
  *     tokenizer provides one) and to segment sentences into terms.
  *
  * @return array pair (summary, word cloud). The word cloud is an array of
  *     terms, except when no terms could be extracted from the document,
  *     in which case both components are empty strings.
  */
 static function getCentroidSummary($doc, $lang)
 {
     $doc = self::pageProcessing($doc);
     /* Formatted copy of the whole document (per the original author:
        characters other than periods and alphanumerics removed); used for
        document-wide term counts and as the result for short inputs. */
     $formatted_doc = self::formatDoc($doc);
     $tokenizer = PhraseParser::getTokenizer($lang);
     /* Splitting into sentences */
     $out_sentences = self::getSentences($doc);
     $n = count($out_sentences);
     $sentences = array();
     if ($tokenizer && method_exists($tokenizer, "stopwordsRemover")) {
         for ($i = 0; $i < $n; $i++) {
             $sentences[$i] = $tokenizer->stopwordsRemover(
                 self::formatDoc($out_sentences[$i]));
         }
     } else {
         /* No stop word remover for this language: rank raw sentences */
         $sentences = $out_sentences;
     }
     /* Splitting into terms */
     $terms = array();
     foreach ($sentences as $sentence) {
         $terms = array_merge($terms,
             PhraseParser::segmentSegment($sentence, $lang));
     }
     $terms = array_filter($terms);
     $terms_counts = array_count_values($terms);
     arsort($terms_counts);
     /* preserve_keys must be true: PHP stores numeric-string terms such as
        "2019" as integer keys and array_slice would otherwise renumber them
        to 0, 1, ..., replacing those terms by their position. */
     $terms_counts = array_slice($terms_counts, 0,
         self::MAX_DISTINCT_TERMS, true);
     /* Array keys are already distinct. Cast so that numeric terms are
        strings again when used as needles and inside the regex below. */
     $terms = array_map('strval', array_keys($terms_counts));
     $t = count($terms);
     if ($t == 0) {
         return array("", "");
     }
     /* $nk[$j] = number of sentences in which term $j occurs */
     $nk = array_fill(0, $t, 0);
     for ($i = 0; $i < $n; $i++) {
         for ($j = 0; $j < $t; $j++) {
             if (strpos($sentences[$i], $terms[$j]) !== false) {
                 $nk[$j]++;
             }
         }
     }
     /* Inverse sentence frequency of each term; 0 if it never occurs */
     $idf = array();
     for ($k = 0; $k < $t; $k++) {
         $idf[$k] = ($nk[$k] == 0) ? 0 : log($n / $nk[$k]);
     }
     /* Count TF over the whole document for finding the centroid */
     $wc = array();
     $b = "\\b";
     if (in_array($lang, array("zh-CN", "ja", "ko"))) {
         /* these languages are matched without word boundaries */
         $b = "";
     }
     for ($j = 0; $j < $t; $j++) {
         /* Quote the term so regex metacharacters or the delimiter in it
            cannot break or alter the pattern. */
         $pattern = "/" . $b . preg_quote($terms[$j], "/") . $b . "/";
         //$matches included for backwards compatibility
         $nt = preg_match_all($pattern, $formatted_doc, $matches);
         if ($nt === false) {
             /* matching failed; treat the term as absent */
             $nt = 0;
         }
         $wc[$j] = $nt * $idf[$j];
         if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
             $wc[$j] = 0;
         }
     }
     /* Calculate centroid: keep only the highest weighted terms */
     arsort($wc);
     $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
     /* Centroid weight per term index; terms outside the centroid get 0 */
     $wc = array_fill(0, $t, 0);
     /* Word cloud: the first WORD_CLOUD_LEN centroid terms by weight */
     $i = 0;
     $word_cloud = array();
     foreach ($centroid as $key => $value) {
         $wc[$key] = $value;
         if ($i < self::WORD_CLOUD_LEN) {
             $word_cloud[$i] = $terms[$key];
         }
         $i++;
     }
     if (strlen($formatted_doc) < PageProcessor::$max_description_len
         || $n == 1) {
         //if input short only use above to get a word cloud
         $formatted_doc = substr($formatted_doc, 0,
             PageProcessor::$max_description_len);
         return array($formatted_doc, $word_cloud);
     }
     /* The centroid norm does not depend on the sentence: compute it once */
     $centroid_norm_sq = 0;
     for ($k = 0; $k < $t; $k++) {
         $centroid_norm_sq += $wc[$k] * $wc[$k];
     }
     $centroid_norm = sqrt($centroid_norm_sq);
     /* Calculate similarity measure between centroid and each sentence */
     $sim = array();
     for ($i = 0; $i < $n; $i++) {
         $dot = 0;
         $sentence_norm_sq = 0;
         for ($k = 0; $k < $t; $k++) {
             $occurrences = substr_count($sentences[$i], $terms[$k]);
             if ($occurrences > 0) {
                 /* log-dampened term frequency scaled by idf; terms that do
                    not occur in the sentence contribute nothing */
                 $wik = $idf[$k] * (1 + log($occurrences));
                 $dot += $wik * $wc[$k] * $idf[$k];
                 $sentence_norm_sq += $wik * $wik;
             }
         }
         $denominator = sqrt($sentence_norm_sq) * $centroid_norm;
         $sim[$i] = ($denominator == 0) ? 0 : $dot / $denominator;
     }
     arsort($sim);
     /* Getting how many sentences should be there in summary */
     $top = self::summarySentenceCount($out_sentences, $sim);
     /* The original code keeps $top - 1 sentences (NOTE(review): presumably
        to drop the sentence that overflowed the length budget -- confirm
        against summarySentenceCount). Clamp at 0 because a negative length
        would make array_slice return all but the last sentence. */
     $sum_array = array_slice($sim, 0, max(0, $top - 1), true);
     /* Restore document order of the selected sentences */
     ksort($sum_array);
     /* Assemble the summary text */
     $summary = '';
     foreach ($sum_array as $key => $value) {
         $summary .= $out_sentences[$key] . ". ";
     }
     return array($summary, $word_cloud);
 }