Пример #1
0
 /**
  * Used to extract the title, description, language, robot metas and
  * links from a string consisting of webpage data.
  *
  * Before parsing, a few HTML entities (&nbsp;, &rdquo;, &ldquo;,
  * &mdash;) are replaced by spaces and script elements are removed.
  * Style elements are removed only from the copy handed to the DOM
  * parser and the text-based fallbacks; the PAGE field of the summary
  * keeps them. If the DOM cannot be built, or if nothing at all could be
  * extracted, the parent class's process() is used instead.
  *
  * @param string $page web-page contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array|null a summary of the contents of the page, or NULL
  *    when $page is not a string
  *
  */
 function process($page, $url)
 {
     if (!is_string($page)) {
         return NULL;
     }
     // preg_replace() returns NULL when PCRE fails (for instance when the
     // backtrack limit is hit by the lazy ".*?" on a very large page).
     // In that case keep the text from before the failed step instead of
     // letting NULL wipe out the whole page.
     $cleanup_patterns = array(
         '/\\&nbsp\\;|\\&rdquo\\;|\\&ldquo\\;|\\&mdash\\;/si',
         '@<script[^>]*?>.*?</script>@si',
     );
     foreach ($cleanup_patterns as $pattern) {
         $cleaned = preg_replace($pattern, ' ', $page);
         if ($cleaned !== NULL) {
             $page = $cleaned;
         }
     }
     $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
     if ($dom_page === NULL) {
         $dom_page = $page;
     }
     $dom = self::dom($dom_page);
     if ($dom === false) {
         // No usable DOM: let the parent class summarize the page.
         return parent::process($page, $url);
     }
     $summary = array();
     $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
     $summary[self::TITLE] = self::title($dom);
     if ($summary[self::TITLE] == "") {
         $summary[self::TITLE] = self::crudeTitle($dom_page);
     }
     $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url);
     if ($this->summarizer_option == self::CENTROID_SUMMARIZER) {
         $summary_cloud = CentroidSummarizer::getCentroidSummary($dom_page, $summary[self::LANG]);
         $summary[self::DESCRIPTION] = $summary_cloud[0];
         $summary[self::WORD_CLOUD] = $summary_cloud[1];
         crawlLog("..Using Centroid Summarizer");
     } else {
         $summary[self::DESCRIPTION] = self::description($dom, $dom_page);
         crawlLog("..Using Basic Summarizer");
     }
     // $crude records that the description came from the crude fallback;
     // the rel-canonical lookup below is skipped in that case.
     $crude = false;
     if (trim($summary[self::DESCRIPTION]) == "") {
         $summary[self::DESCRIPTION] = self::crudeDescription($dom_page);
         crawlLog("..No text extracted. " . "Invoked crude description fallback.");
         $crude = true;
     }
     $summary[self::LINKS] = self::links($dom, $url);
     if ($summary[self::LINKS] == array()) {
         // DOM gave no links; scan the text (style blocks included).
         $summary[self::LINKS] = parent::extractHttpHttpsUrls($page);
     }
     $location = self::location($dom, $url);
     if ($location) {
         // Record the location as an extra link and note it in the text.
         $summary[self::LINKS][$location] = "location:" . $url;
         $summary[self::LOCATION] = true;
         $summary[self::DESCRIPTION] .= $url . " => " . $location;
         if (!$summary[self::TITLE]) {
             $summary[self::TITLE] = $url;
         }
     }
     if (!$crude && !$location) {
         $location = self::relCanonical($dom, $url);
         if ($location) {
             // A rel-canonical target replaces every other link found.
             $summary[self::LINKS] = array();
             $summary[self::LINKS][$location] = "location:" . $url;
             $summary[self::LOCATION] = true;
             if (!$summary[self::DESCRIPTION]) {
                 $summary[self::DESCRIPTION] .= $url . " => " . $location;
             }
             if (!$summary[self::TITLE]) {
                 $summary[self::TITLE] = $url;
             }
         }
     }
     $summary[self::PAGE] = $page;
     $nothing_extracted =
         strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0
         && count($summary[self::LINKS]) == 0
         && !$location;
     if ($nothing_extracted) {
         /* maybe not html? treat as text with messed up tags
            still try to get urls: fill every empty field from the
            parent's summary of the tag-stripped page
          */
         $summary_text = parent::process(strip_tags($page), $url);
         foreach ($summary as $field => $value) {
             if (($value == "" || $value == array()) && isset($summary_text[$field])) {
                 $summary[$field] = $summary_text[$field];
             }
         }
     }
     return $summary;
 }
Пример #2
0
 /**
  * Computes a summary based on a text string of a document
  *
  * The description is either the centroid summarizer's output (which
  * also yields a word cloud) or the first self::$max_description_len
  * characters of the text. The LANG field is computed from that
  * description, and the PAGE field holds the tag-stripped text wrapped
  * in a minimal html/pre skeleton.
  *
  * @param string $page text string of a document
  * @param string $url location the document came from, not used by
  *     TextProcessor at this point. Some of its subclasses override
  *     this method and use url to produce complete links for
  *     relative links within a document
  *
  * @return array|null a summary of (title, description,links, and
  *     content) of the information in $page, or NULL when $page is not
  *     a string
  */
 function process($page, $url)
 {
     if (!is_string($page)) {
         return NULL;
     }
     $summary = array();
     $summary[self::TITLE] = "";
     if ($this->summarizer_option == self::CENTROID_SUMMARIZER) {
         // The whole-page language guess is only consumed by the centroid
         // summarizer, so it is computed here rather than on every call.
         $lang = self::calculateLang($page);
         $summary_cloud = CentroidSummarizer::getCentroidSummary($page, $lang);
         $summary[self::DESCRIPTION] = $summary_cloud[0];
         $summary[self::WORD_CLOUD] = $summary_cloud[1];
     } else {
         $summary[self::DESCRIPTION] = mb_substr($page, 0, self::$max_description_len);
     }
     // The stored language is derived from the description, not the page.
     $summary[self::LANG] = self::calculateLang($summary[self::DESCRIPTION]);
     $summary[self::LINKS] = self::extractHttpHttpsUrls($page);
     $summary[self::PAGE] = "<html><body><div><pre>" . strip_tags($page) . "</pre></div></body></html>";
     return $summary;
 }