/**
 * Builds a page summary from the raw bytes of a binary Word document
 * (2004 or earlier).
 *
 * The readable text is pulled out with extractASCIIText() and then
 * summarized by the parent processor. If no text can be obtained (or
 * $page is not a string) the url itself is summarized instead.
 *
 * @param string $page the raw document contents
 * @param string $url the url where the contents came from; it is handed
 *      on to the parent processor and used as the fallback text
 *
 * @return array a summary of the contents of the page, as produced by
 *      the parent processor
 */
function process($page, $url)
{
    $extracted = is_string($page) ? self::extractASCIIText($page) : "";
    if ($extracted == "") {
        // nothing usable was extracted, so summarize the url instead
        $extracted = $url;
    }
    return parent::process($extracted, $url);
}
/**
 * Used to extract the title, description and links from
 * a string consisting of sitemap data.
 *
 * The title is set to the url, the description to "Sitemap of <url>" and
 * the language to en-US. When the page cannot be parsed into a DOM, or no
 * links could be extracted from it, the page is summarized by the parent
 * processor instead so that urls it contains may still be found. In every
 * case where $page is a string the summary is flagged with JUST_METAS.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *      used to canonicalize relative links
 *
 * @return array a summary of the contents of the page, or NULL if
 *      $page is not a string
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        $dom = self::dom($page);
        if ($dom !== false) {
            $summary[self::TITLE] = $url;
            $summary[self::DESCRIPTION] = "Sitemap of " . $url;
            $summary[self::LANG] = "en-US";
            $summary[self::LINKS] = self::links($dom, $url);
            /* The title and description above are never empty, so only
               the absence of links can signal that this was not really
               a sitemap.
             */
            if (count($summary[self::LINKS]) == 0) {
                //maybe not a sitemap? treat as text still try to get urls
                $summary = parent::process($page, $url);
            }
        } else {
            $summary = parent::process($page, $url);
        }
        $summary[self::JUST_METAS] = true;
    }
    return $summary;
}
/**
 * Used to extract the title, description and links from
 * a string consisting of webpage data.
 *
 * Steps performed when $page is a string:
 *  - a few named entities (&nbsp; &rdquo; &ldquo; &mdash;) and all script
 *    elements are replaced by spaces; style elements are additionally
 *    removed from the copy that is parsed into a DOM
 *  - robot metas, title, language, description and links are extracted,
 *    each with a cruder fallback when the DOM-based extraction is empty
 *  - a redirect location (or, failing that, a rel canonical target) is
 *    recorded as a "location:" link
 *  - if nothing at all was extracted the tag-stripped page is summarized
 *    by the parent processor and used to fill in the empty fields
 * If the page cannot be parsed into a DOM it is summarized by the parent
 * processor.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *      used to canonicalize relative links
 *
 * @return array a summary of the contents of the page, or NULL if
 *      $page is not a string
 */
function process($page, $url)
{
    $summary = NULL;
    $is_centroid =
        $this->summarizer_option == self::CENTROID_SUMMARIZER;
    if (!is_string($page)) {
        return $summary;
    }
    $cleanup_patterns = array(
        '/&nbsp;|&rdquo;|&ldquo;|&mdash;/si',
        '@<script[^>]*?>.*?</script>@si');
    foreach ($cleanup_patterns as $pattern) {
        $cleaned = preg_replace($pattern, ' ', $page);
        // preg_replace gives NULL on a PCRE error; keep the text we had
        if ($cleaned !== NULL) {
            $page = $cleaned;
        }
    }
    $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
    if ($dom_page === NULL) {
        $dom_page = $page;
    }
    $dom = self::dom($dom_page);
    if ($dom === false) {
        return parent::process($page, $url);
    }
    $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
    $summary[self::TITLE] = self::title($dom);
    if ($summary[self::TITLE] == "") {
        $summary[self::TITLE] = self::crudeTitle($dom_page);
    }
    $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url);
    if ($is_centroid) {
        $summary_cloud = CentroidSummarizer::getCentroidSummary(
            $dom_page, $summary[self::LANG]);
        $summary[self::DESCRIPTION] = $summary_cloud[0];
        $summary[self::WORD_CLOUD] = $summary_cloud[1];
        crawlLog("..Using Centroid Summarizer");
    } else {
        $summary[self::DESCRIPTION] = self::description($dom, $dom_page);
        crawlLog("..Using Basic Summarizer");
    }
    $crude = false;
    if (trim($summary[self::DESCRIPTION]) == "") {
        $summary[self::DESCRIPTION] = self::crudeDescription($dom_page);
        crawlLog("..No text extracted. " .
            "Invoked crude description fallback.");
        $crude = true;
    }
    $summary[self::LINKS] = self::links($dom, $url);
    if ($summary[self::LINKS] == array()) {
        $summary[self::LINKS] = parent::extractHttpHttpsUrls($page);
    }
    $location = self::location($dom, $url);
    if ($location) {
        $summary[self::LINKS][$location] = "location:" . $url;
        $summary[self::LOCATION] = true;
        $summary[self::DESCRIPTION] .= $url . " => " . $location;
        if (!$summary[self::TITLE]) {
            $summary[self::TITLE] = $url;
        }
    }
    if (!$crude && !$location) {
        $location = self::relCanonical($dom, $url);
        if ($location) {
            // a canonical target replaces every other extracted link
            $summary[self::LINKS] = array();
            $summary[self::LINKS][$location] = "location:" . $url;
            $summary[self::LOCATION] = true;
            if (!$summary[self::DESCRIPTION]) {
                $summary[self::DESCRIPTION] .= $url . " => " . $location;
            }
            if (!$summary[self::TITLE]) {
                $summary[self::TITLE] = $url;
            }
        }
    }
    $summary[self::PAGE] = $page;
    if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0
        && count($summary[self::LINKS]) == 0 && !$location) {
        /* maybe not html? treat as text with messed up tags
           still try to get urls
         */
        $summary_text = parent::process(strip_tags($page), $url);
        foreach ($summary as $field => $value) {
            if (($value == "" || $value == array()) &&
                isset($summary_text[$field])) {
                $summary[$field] = $summary_text[$field];
            }
        }
    }
    return $summary;
}
/**
 * Used to extract the title, description and links from
 * a string consisting of svg image. If the image is small
 * enough, an attempt is made to generate a thumbnail
 *
 * On success the summary has an empty title, no links, and a PAGE field
 * holding a tiny html document that embeds the svg as a base64 data url.
 * If the page cannot be parsed into a DOM with a document element it is
 * summarized by the parent processor instead.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *      used to canonicalize relative links
 *
 * @return array a summary of the contents of the page, or NULL if
 *      $page is not a string
 */
function process($page, $url)
{
    // initialised so that a non-string $page yields NULL, not an
    // undefined variable
    $summary = NULL;
    if (is_string($page)) {
        // return value unused; presumably fixes $page in place -- confirm
        self::closeDanglingTags($page);
        $dom = self::dom($page);
        if ($dom !== false && isset($dom->documentElement)) {
            $summary[self::TITLE] = "";
            $summary[self::DESCRIPTION] = self::description($dom);
            $summary[self::LINKS] = array();
            /* NOTE(review): the description is placed in a single-quoted
               attribute without escaping here; confirm description()
               already returns attribute-safe text.
             */
            $summary[self::PAGE] =
                "<html><body><div><img src='data:image/svg+xml;base64," .
                base64_encode($page) . "' alt='" .
                $summary[self::DESCRIPTION] .
                "' /></div></body></html>";
            if (strlen($page) < self::MAX_THUMB_LEN) {
                $thumb_string = self::createThumb($dom);
                $summary[self::THUMB] = 'data:image/svg+xml;base64,' .
                    base64_encode($thumb_string);
            }
        } else {
            $summary = parent::process($page, $url);
        }
    }
    return $summary;
}
/**
 * Summarizes a generic XML document by handing it to a more specific
 * processor chosen from the name of the document's root element.
 *
 * Root elements rss, html, sitemapindex, urlset and svg are mapped to
 * RssProcessor, HtmlProcessor, SitemapProcessor (twice) and SvgProcessor
 * respectively. Any other root name, including the case where no root
 * element could be determined, is summarized by the parent processor.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *      used to canonicalize relative links
 *
 * @return array a summary of the contents of the page, or NULL if
 *      $page is not a string
 */
function process($page, $url)
{
    if (!is_string($page)) {
        return NULL;
    }
    // return value unused; presumably fixes $page in place -- confirm
    self::closeDanglingTags($page);
    $dom = self::dom($page);
    $root_name = "";
    if (isset($dom->documentElement->nodeName)) {
        $root_name = $dom->documentElement->nodeName;
    }
    // the DOM was only needed to learn the root element's name
    unset($dom);
    $handlers = array(
        "rss" => "RssProcessor",
        "html" => "HtmlProcessor",
        "sitemapindex" => "SitemapProcessor",
        "urlset" => "SitemapProcessor",
        "svg" => "SvgProcessor"
    );
    if (!isset($handlers[$root_name])) {
        return parent::process($page, $url);
    }
    $handler_class = $handlers[$root_name];
    $handler = new $handler_class($this->plugin_instances);
    return $handler->process($page, $url);
}
/**
 * Computes a summary based on a string of a binary Powerpoint document
 * (as opposed to the modern xml powerpoint format).
 *
 * The document is scanned one byte at a time by a crude finite state
 * machine. A text segment is recognized by two zero bytes (further zero
 * bytes are tolerated) followed by the bytes 168 and 15; the next four
 * bytes are read as a little-endian segment length, after which printable
 * ASCII characters and newlines are collected until that many have been
 * kept. Once a collected segment contains the master slide boilerplate
 * "lick to edit Master title style" the rest of the document is ignored.
 * The collected segments are joined with newlines and summarized by the
 * parent processor; if nothing was collected the url is summarized
 * instead.
 *
 * @param string $page string of a Powerpoint document
 * @param string $url location the document came from; passed on to the
 *      parent processor and used as the text when none could be extracted
 *
 * @return array a summary of the information in $page as produced by
 *      the parent processor
 */
function process($page, $url)
{
    $text = "";
    if (is_string($page)) {
        $segments = array();
        $state = self::PPT_IGNORING;
        $num_bytes = strlen($page);
        for ($pos = 0; $pos < $num_bytes; $pos++) {
            $byte = ord($page[$pos]);
            switch ($state) {
                case self::PPT_IGNORING:
                    if ($byte == 0) {
                        $state = self::ZEROONE_IGNORING;
                    }
                    break;
                case self::ZEROONE_IGNORING:
                    $state = ($byte == 0) ? self::ZEROTWO_IGNORING :
                        self::PPT_IGNORING;
                    break;
                case self::ZEROTWO_IGNORING:
                    // extra zero bytes keep us waiting in this state
                    if ($byte == 168) {
                        $state = self::FIRST_CHAR_TEXT_SEG;
                    } elseif ($byte != 0) {
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::FIRST_CHAR_TEXT_SEG:
                    if ($byte == 15) {
                        $state = self::READ_LEN_TEXT_SEG;
                        $segment_len = 0;
                        $len_bytes_read = 0;
                    } else {
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::READ_LEN_TEXT_SEG:
                    if ($len_bytes_read < 4) {
                        // least significant length byte arrives first
                        $segment_len += $byte << ($len_bytes_read * 8);
                        $len_bytes_read++;
                    } else {
                        // the byte after the length starts the segment
                        $state = self::SCAN_TEXT_SEG;
                        $chars_kept = 0;
                        $segment = chr($byte);
                    }
                    break;
                case self::SCAN_TEXT_SEG:
                    if (strpos($segment,
                        "lick to edit Master title style") > 0) {
                        $state = self::ALWAYS_IGNORE;
                    } elseif ($chars_kept < $segment_len) {
                        $is_text_byte = ($byte >= 32 && $byte <= 126)
                            || $byte == 10;
                        if ($is_text_byte) {
                            $segment .= chr($byte);
                            $chars_kept++;
                        }
                    } else {
                        $segments[] = $segment;
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::ALWAYS_IGNORE:
                    // terminal state: nothing more is collected
                    break;
            }
        }
        $text = implode("\n", $segments);
    }
    if ($text == "") {
        $text = $url;
    }
    return parent::process($text, $url);
}
/**
 * Used to extract the title, description and links from
 * a string consisting of rss or atom news feed data.
 *
 * The document is treated as atom when it contains at least one feed
 * element, otherwise as rss. If the page cannot be parsed into a DOM, or
 * if no title, description or links could be extracted, the page is
 * summarized by the parent processor instead.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *      used to canonicalize relative links
 *
 * @return array a summary of the contents of the page, or NULL if
 *      $page is not a string
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        $dom = self::dom($page);
        if ($dom === false) {
            /* This has to be checked before any method is called on
               $dom; an unparsable page is handled as plain text.
             */
            return parent::process($page, $url);
        }
        $atom = $dom->getElementsByTagName('feed')->length > 0;
        $summary[self::TITLE] = self::title($dom, $atom);
        $summary[self::DESCRIPTION] = self::description($dom, $atom);
        $summary[self::LANG] = self::lang($dom,
            $summary[self::DESCRIPTION]);
        $summary[self::LINKS] = self::links($dom, $url, $atom);
        if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
            == 0 && count($summary[self::LINKS]) == 0) {
            //maybe not rss or atom? treat as text still try to get urls
            $summary = parent::process($page, $url);
        }
    }
    return $summary;
}