PHP PhraseParser::segmentSegment Exemples

Langage de programmation: PHP

Class/Type: PhraseParser

Méthode/Fonction: segmentSegment

Exemples sur hotexamples.com : 1

PHP PhraseParser::segmentSegment - 1 exemple trouvé. Voici l'exemple réel le mieux noté de PhraseParser::segmentSegment, extrait de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

extractPhrasesInLists(6)

getTokenizer(5)

computeSafeSearchScore(4)

calculateMetas(3)

calculateLinkMetas(2)

canonicalizePunctuatedTerms(1)

extractPhrases(1)

extractPhrasesAndCount(1)

extractWordStringPageSummary(1)

getCharGramsTerm(1)

getCosineRank(1)

getIntersection(1)

reverseMaximalMatch(1)

segmentSegment(1)

stemCharGramSegment(1)

stemTerms(1)

Méthodes fréquemment utilisées

extractPhrasesInLists (6)

getTokenizer (5)

computeSafeSearchScore (4)

calculateMetas (3)

calculateLinkMetas (2)

canonicalizePunctuatedTerms (1)

extractPhrases (1)

extractPhrasesAndCount (1)

extractWordStringPageSummary (1)

getCharGramsTerm (1)

Méthodes fréquemment utilisées

getCosineRank (1)

getIntersection (1)

reverseMaximalMatch (1)

segmentSegment (1)

stemCharGramSegment (1)

stemTerms (1)

Exemple #1

0

Afficher le fichier

Fichier : centroid_summarizer.php Projet : yakar/yioop

/**
 * Generates a centroid with which every sentence is ranked using the cosine
 * ranking method, and also generates a word cloud.
 *
 * The document is split into sentences, the sentences are segmented into
 * terms, and the most frequent terms are weighted by
 * (occurrences in the formatted document) * log(n / sentences containing
 * the term). The highest weighted terms form the centroid and the word
 * cloud. Each sentence is then scored against the centroid and the best
 * scoring sentences, in their original order, make up the summary.
 *
 * @param string $doc complete raw page to generate the summary from.
 * @param string $lang language tag of the page; used to look up the
 *      tokenizer (whose stopwordsRemover method, if present, is applied to
 *      every sentence) and to segment the sentences into terms.
 *
 * @return array two element array (summary, word cloud). The word cloud is
 *      an array of terms; when no terms could be extracted both elements
 *      are empty strings.
 */
static function getCentroidSummary($doc, $lang)
{
    $doc = self::pageProcessing($doc);
    /* Format the document to remove characters other than periods and
       alphanumerics. */
    $formatted_doc = self::formatDoc($doc);
    $stop_obj = PhraseParser::getTokenizer($lang);
    /* Splitting into sentences */
    $out_sentences = self::getSentences($doc);
    $n = count($out_sentences);
    $sentences = array();
    if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
        for ($i = 0; $i < $n; $i++) {
            $sentences[$i] = $stop_obj->stopwordsRemover(
                self::formatDoc($out_sentences[$i]));
        }
    } else {
        $sentences = $out_sentences;
    }
    /* Splitting into terms */
    $terms = array();
    foreach ($sentences as $sentence) {
        $terms = array_merge($terms,
            PhraseParser::segmentSegment($sentence, $lang));
    }
    $terms = array_filter($terms);
    $terms_counts = array_count_values($terms);
    arsort($terms_counts);
    $terms_counts = array_slice($terms_counts, 0,
        self::MAX_DISTINCT_TERMS);
    /* Array keys are already unique. PHP turns numeric string keys into
       integers, so cast back to strings: before PHP 8 strpos() treats an
       integer needle as a character ordinal rather than as text. */
    $terms = array_map('strval', array_keys($terms_counts));
    $t = count($terms);
    if ($t == 0) {
        return array("", "");
    }
    /* Initialize Nk array (number of sentences the term occurs in) */
    $nk = array_fill(0, $t, 0);
    for ($i = 0; $i < $n; $i++) {
        for ($j = 0; $j < $t; $j++) {
            if (strpos($sentences[$i], $terms[$j]) !== false) {
                $nk[$j]++;
            }
        }
    }
    /* Inverse document frequency of each term, log(n / nk); 0 for a term
       that was found in no sentence. */
    $idf = array();
    for ($k = 0; $k < $t; $k++) {
        $idf[$k] = ($nk[$k] == 0) ? 0 : log($n / $nk[$k]);
    }
    /* Count TF for finding centroid */
    $wc = array();
    /* No word boundary assertion is used for these languages. */
    $boundary = in_array($lang, array("zh-CN", "ja", "ko"), true) ?
        "" : "\\b";
    for ($j = 0; $j < $t; $j++) {
        /* Quote the term so that characters such as ".", "+", "(" or "/"
           are matched literally instead of altering (or breaking) the
           pattern. */
        $pattern = "/" . $boundary . preg_quote($terms[$j], "/") .
            $boundary . "/";
        $nt = preg_match_all($pattern, $formatted_doc, $matches);
        //$matches included for backwards compatibility
        if ($nt === false) {
            /* PCRE failure: treat the term as not occurring */
            $nt = 0;
        }
        $wc[$j] = $nt * $idf[$j];
        if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
            $wc[$j] = 0;
        }
    }
    /* Calculate centroid */
    arsort($wc);
    $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
    /* Initializing centroid weight array by 0; only the centroid
       components get a non-zero weight below. */
    $wc = array_fill(0, $t, 0);
    /* Word cloud */
    $i = 0;
    $word_cloud = array();
    foreach ($centroid as $key => $value) {
        $wc[$key] = $value;
        if ($i < self::WORD_CLOUD_LEN) {
            $word_cloud[$i] = $terms[$key];
        }
        $i++;
    }
    if (strlen($formatted_doc) < PageProcessor::$max_description_len
        || $n == 1) {
        //if input short only use above to get a word cloud
        $formatted_doc = substr($formatted_doc, 0,
            PageProcessor::$max_description_len);
        return array($formatted_doc, $word_cloud);
    }
    /* The centroid's norm does not depend on the sentence, so compute it
       once instead of once per sentence. */
    $centroid_squares = 0;
    for ($k = 0; $k < $t; $k++) {
        $centroid_squares += $wc[$k] * $wc[$k];
    }
    $centroid_norm = sqrt($centroid_squares);
    /* Calculate similarity measure between centroid and each sentence */
    $sim = array();
    for ($i = 0; $i < $n; $i++) {
        $dot = 0;
        $sentence_squares = 0;
        for ($k = 0; $k < $t; $k++) {
            $tmp = substr_count($sentences[$i], $terms[$k]);
            $wik = ($tmp > 0) ? $idf[$k] * (1 + log($tmp)) : 0;
            $dot += $wik * $wc[$k] * $idf[$k];
            $sentence_squares += $wik * $wik;
        }
        $d = sqrt($sentence_squares) * $centroid_norm;
        $sim[$i] = ($d == 0) ? 0 : $dot / $d;
    }
    arsort($sim);
    /* Getting how many sentences should be there in summary */
    $top = self::summarySentenceCount($out_sentences, $sim);
    /* Keep the first $top - 1 best scoring sentences, then restore
       document order. */
    $sum_array = array_slice($sim, 0, $top - 1, true);
    ksort($sum_array);
    /* Building the summary */
    $summary = '';
    foreach ($sum_array as $key => $value) {
        $summary .= "{$out_sentences[$key]}" . ". ";
    }
    /* Summary of text summarization */
    return array($summary, $word_cloud);
}