/**
 * Used to extract the title, description and links from
 * a string consisting of webpage data.
 *
 * The page text is first cleaned: a few named HTML entities are replaced
 * by spaces and script elements are removed. Style elements are removed
 * only from the copy ($dom_page) that is handed to the DOM parser and the
 * summarizers; the copy stored under self::PAGE keeps them.
 *
 * Keys set on the returned summary when the page parses as a DOM:
 * self::ROBOT_METAS, self::TITLE, self::LANG, self::DESCRIPTION,
 * self::LINKS, self::PAGE, plus self::WORD_CLOUD when the centroid
 * summarizer is selected and self::LOCATION when self::location() or
 * self::relCanonical() reports a target url.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page; NULL if $page is
 *     not a string; the result of parent::process() if self::dom()
 *     returns false
 *
 */
function process($page, $url)
{
    $is_centroid =
        $this->summarizer_option == self::CENTROID_SUMMARIZER;
    if (!is_string($page)) {
        return NULL;
    }
    /* Passes applied, in order, to the text that is kept as the page:
       1. the nbsp, rdquo, ldquo and mdash named entities become a space
          (the first alternative used to be a mangled pattern matching
          a space followed by ';' rather than the nbsp entity);
       2. script elements are dropped.
     */
    $page_patterns = array(
        '/&(?:nbsp|rdquo|ldquo|mdash);/si',
        '@<script[^>]*?>.*?</script>@si',
    );
    foreach ($page_patterns as $pattern) {
        $cleaned = preg_replace($pattern, ' ', $page);
        /* preg_replace returns NULL on a PCRE error (for instance when
           the backtrack limit is hit); keep the text we had rather
           than losing the whole page */
        if ($cleaned !== NULL) {
            $page = $cleaned;
        }
    }
    // style elements are stripped only from the copy used for parsing
    $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
    if ($dom_page === NULL) {
        $dom_page = $page;
    }
    $dom = self::dom($dom_page);
    if ($dom === false) {
        // could not build a DOM, so let the parent processor handle it
        return parent::process($page, $url);
    }
    $summary = array();
    $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
    $summary[self::TITLE] = self::title($dom);
    if ($summary[self::TITLE] == "") {
        $summary[self::TITLE] = self::crudeTitle($dom_page);
    }
    $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url);
    if ($is_centroid) {
        $summary_cloud = CentroidSummarizer::getCentroidSummary(
            $dom_page, $summary[self::LANG]);
        $summary[self::DESCRIPTION] = $summary_cloud[0];
        $summary[self::WORD_CLOUD] = $summary_cloud[1];
        crawlLog("..Using Centroid Summarizer");
    } else {
        $summary[self::DESCRIPTION] = self::description($dom, $dom_page);
        crawlLog("..Using Basic Summarizer");
    }
    // $crude records that the chosen summarizer produced no text
    $crude = false;
    if (trim($summary[self::DESCRIPTION]) == "") {
        $summary[self::DESCRIPTION] = self::crudeDescription($dom_page);
        crawlLog("..No text extracted. " .
            "Invoked crude description fallback.");
        $crude = true;
    }
    $summary[self::LINKS] = self::links($dom, $url);
    if ($summary[self::LINKS] == array()) {
        // no links found via the DOM; scan the page text for urls instead
        $summary[self::LINKS] = parent::extractHttpHttpsUrls($page);
    }
    /* NOTE(review): self::location() presumably reports a redirect
       target found in the page -- confirm against that helper */
    $location = self::location($dom, $url);
    if ($location) {
        $summary[self::LINKS][$location] = "location:" . $url;
        $summary[self::LOCATION] = true;
        $summary[self::DESCRIPTION] .= $url . " => " . $location;
        if (!$summary[self::TITLE]) {
            $summary[self::TITLE] = $url;
        }
    }
    if (!$crude && !$location) {
        /* only consulted when real text was extracted and no location
           was found above; a hit replaces all previously found links */
        $location = self::relCanonical($dom, $url);
        if ($location) {
            $summary[self::LINKS] = array();
            $summary[self::LINKS][$location] = "location:" . $url;
            $summary[self::LOCATION] = true;
            if (!$summary[self::DESCRIPTION]) {
                $summary[self::DESCRIPTION] .= $url . " => " . $location;
            }
            if (!$summary[self::TITLE]) {
                $summary[self::TITLE] = $url;
            }
        }
    }
    $summary[self::PAGE] = $page;
    if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0
        && count($summary[self::LINKS]) == 0 && !$location) {
        /* maybe not html? treat as text with messed up tags
           still try to get urls
         */
        $summary_text = parent::process(strip_tags($page), $url);
        foreach ($summary as $field => $value) {
            // only fill in fields that are still empty
            if (($value == "" || $value == array())
                && isset($summary_text[$field])) {
                $summary[$field] = $summary_text[$field];
            }
        }
    }
    return $summary;
}
/**
 * Builds a summary array from the plain text of a document.
 *
 * The description is either the centroid summarizer's output (in which
 * case a word cloud is stored as well) or the leading
 * self::$max_description_len characters of the text. The language is
 * calculated from the description, links are the http/https urls found
 * in the text, and the stored page is the tag-stripped text wrapped in
 * a minimal html/pre skeleton.
 *
 * @param string $page text string of a document
 * @param string $url location the document came from; not read by this
 *     implementation. Subclasses that override this method may use it
 *     to produce complete links for relative links within a document
 *
 * @return array a summary of (title, description, links, and content) of
 *     the information in $page, or NULL if $page is not a string
 */
function process($page, $url)
{
    if (!is_string($page)) {
        return NULL;
    }
    $result = array(self::TITLE => "");
    $page_lang = self::calculateLang($page);
    if ($this->summarizer_option != self::CENTROID_SUMMARIZER) {
        // basic mode: the description is just the start of the text
        $result[self::DESCRIPTION] =
            mb_substr($page, 0, self::$max_description_len);
    } else {
        $centroid = CentroidSummarizer::getCentroidSummary($page,
            $page_lang);
        $result[self::DESCRIPTION] = $centroid[0];
        $result[self::WORD_CLOUD] = $centroid[1];
    }
    // the stored language is calculated from the description
    $result[self::LANG] = self::calculateLang($result[self::DESCRIPTION]);
    $result[self::LINKS] = self::extractHttpHttpsUrls($page);
    $plain_text = strip_tags($page);
    $result[self::PAGE] = "<html><body><div><pre>" . $plain_text .
        "</pre></div></body></html>";
    return $result;
}