Beispiel #1
0
 /**
  * Builds a page summary from a string holding the bytes of a binary
  * Word document (Word 2004 or earlier).
  *
  * The text obtained from extractASCIIText() is passed on to the parent
  * class's process() method, whose result is returned unchanged. When
  * $page is not a string, or the extracted text is empty, the url is
  * summarized in its place.
  *
  * @param string $page  the contents of the Word document
  * @param string $url  the url where the document came from; it is passed
  *    through to the parent processor and doubles as the text to
  *    summarize when nothing else is available
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $text = is_string($page) ? self::extractASCIIText($page) : "";
     if ($text == "") {
         // nothing usable to summarize, so fall back to the url itself
         $text = $url;
     }
     return parent::process($text, $url);
 }
Beispiel #2
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of sitemap data.
  *
  * The title is set to the url and the description to "Sitemap of <url>";
  * the language is fixed to "en-US". If the document parses but no links
  * are found in it, it is probably not a sitemap, so the page is handed
  * to the parent class's process() method to still try to get urls. The
  * same happens when no DOM can be built at all. In every case where
  * $page is a string, the JUST_METAS flag is set on the result.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array a summary of the contents of the page, or NULL if
  *    $page is not a string
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     if (!is_string($page)) {
         return $summary;
     }
     $dom = self::dom($page);
     if ($dom !== false) {
         $summary[self::TITLE] = $url;
         $summary[self::DESCRIPTION] = "Sitemap of " . $url;
         $summary[self::LANG] = "en-US";
         $summary[self::LINKS] = self::links($dom, $url);
         /* The title and description above are never empty, so the only
            meaningful "not a sitemap" signal is the absence of links.
            (Also testing the title/description length, as was done
            before, made this fallback unreachable.)
          */
         if (count($summary[self::LINKS]) == 0) {
             //maybe not a sitemap? treat as text still try to get urls
             $summary = parent::process($page, $url);
         }
     } else {
         $summary = parent::process($page, $url);
     }
     $summary[self::JUST_METAS] = true;
     return $summary;
 }
Beispiel #3
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of webpage data.
  *
  * A few HTML entities and all <script> blocks are replaced by spaces in
  * $page; <style> blocks are additionally blanked out in the copy that is
  * used to build the DOM. The summary fields are then filled in from that
  * DOM. If no DOM can be built, the page is handed to the parent class's
  * process() method instead.
  *
  * @param string $page web-page contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, or NULL if
  *    $page is not a string
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     $is_centroid = $this->summarizer_option == self::CENTROID_SUMMARIZER;
     if (is_string($page)) {
         /* preg_replace() returns NULL when the regex engine fails (for
            example when a backtrack limit is hit on a very large page).
            In that case keep the text from before the failed step rather
            than losing the whole page.
          */
         $cleaned = preg_replace('/\\&nbsp\\;|\\&rdquo\\;|\\&ldquo\\;|\\&mdash\\;/si', ' ', $page);
         if ($cleaned !== NULL) {
             $page = $cleaned;
         }
         $cleaned = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
         if ($cleaned !== NULL) {
             $page = $cleaned;
         }
         $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
         if ($dom_page === NULL) {
             $dom_page = $page;
         }
         $dom = self::dom($dom_page);
         if ($dom !== false) {
             $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
             $summary[self::TITLE] = self::title($dom);
             if ($summary[self::TITLE] == "") {
                 $summary[self::TITLE] = self::crudeTitle($dom_page);
             }
             $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url);
             if ($is_centroid) {
                 // index 0 is used as the description, index 1 as the word cloud
                 $summary_cloud = CentroidSummarizer::getCentroidSummary($dom_page, $summary[self::LANG]);
                 $summary[self::DESCRIPTION] = $summary_cloud[0];
                 $summary[self::WORD_CLOUD] = $summary_cloud[1];
                 crawlLog("..Using Centroid Summarizer");
             } else {
                 $summary[self::DESCRIPTION] = self::description($dom, $dom_page);
                 crawlLog("..Using Basic Summarizer");
             }
             // remembers whether the description came from the crude fallback
             $crude = false;
             if (trim($summary[self::DESCRIPTION]) == "") {
                 $summary[self::DESCRIPTION] = self::crudeDescription($dom_page);
                 crawlLog("..No text extracted. " . "Invoked crude description fallback.");
                 $crude = true;
             }
             $summary[self::LINKS] = self::links($dom, $url);
             if ($summary[self::LINKS] == array()) {
                 // no links found via the DOM; scan the page text for urls
                 $summary[self::LINKS] = parent::extractHttpHttpsUrls($page);
             }
             $location = self::location($dom, $url);
             if ($location) {
                 // record the target reported by location() as a special link
                 $summary[self::LINKS][$location] = "location:" . $url;
                 $summary[self::LOCATION] = true;
                 $summary[self::DESCRIPTION] .= $url . " => " . $location;
                 if (!$summary[self::TITLE]) {
                     $summary[self::TITLE] = $url;
                 }
             }
             /* relCanonical() is only consulted when a non-crude
                description was extracted and location() found nothing.
              */
             if (!$crude && !$location) {
                 $location = self::relCanonical($dom, $url);
                 if ($location) {
                     // all other links are dropped in favor of this one
                     $summary[self::LINKS] = array();
                     $summary[self::LINKS][$location] = "location:" . $url;
                     $summary[self::LOCATION] = true;
                     if (!$summary[self::DESCRIPTION]) {
                         $summary[self::DESCRIPTION] .= $url . " => " . $location;
                     }
                     if (!$summary[self::TITLE]) {
                         $summary[self::TITLE] = $url;
                     }
                 }
             }
             $summary[self::PAGE] = $page;
             if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0 && !$location) {
                 /*maybe not html? treat as text with messed up tags
                      still try to get urls
                   */
                 $summary_text = parent::process(strip_tags($page), $url);
                 // only fill in fields that are still empty
                 foreach ($summary as $field => $value) {
                     if (($value == "" || $value == array()) && isset($summary_text[$field])) {
                         $summary[$field] = $summary_text[$field];
                     }
                 }
             }
         } else {
             // no DOM could be built, so let the parent processor handle it
             $summary = parent::process($page, $url);
         }
     }
     return $summary;
 }
Beispiel #4
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of svg image. If the image is small
  * enough, an attempt is made to generate a thumbnail
  *
  * When a DOM with a document element can be built, the summary's PAGE
  * field is a small html document embedding the svg as a base64 data url
  * with the description as alt text. Otherwise the page is handed to the
  * parent class's process() method.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, or NULL if
  *    $page is not a string
  *
  */
 function process($page, $url)
 {
     // initialized up front so a non-string $page yields NULL rather than
     // an undefined variable
     $summary = NULL;
     if (is_string($page)) {
         // NOTE(review): presumably repairs $page in place (by reference);
         // confirm against closeDanglingTags()
         self::closeDanglingTags($page);
         $dom = self::dom($page);
         if ($dom !== false && isset($dom->documentElement)) {
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] = self::description($dom);
             $summary[self::LINKS] = array();
             /* The description comes from the crawled document, so it is
                escaped before being placed inside the single-quoted alt
                attribute; otherwise a quote in it would break the markup.
                Existing entities are left alone (double_encode = false).
              */
             $alt_text = htmlspecialchars((string) $summary[self::DESCRIPTION], ENT_QUOTES, 'UTF-8', false);
             $summary[self::PAGE] = "<html><body><div><img src='data:image/svg+xml;base64," . base64_encode($page) . "' alt='" . $alt_text . "' /></div></body></html>";
             if (strlen($page) < self::MAX_THUMB_LEN) {
                 $thumb_string = self::createThumb($dom);
                 $summary[self::THUMB] = 'data:image/svg+xml;base64,' . base64_encode($thumb_string);
             }
         } else {
             $summary = parent::process($page, $url);
         }
     }
     return $summary;
 }
Beispiel #5
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of xml data.
  *
  * The name of the document's root element decides which processor does
  * the work: rss, html, sitemapindex, urlset and svg roots are each
  * delegated to a newly created processor of the matching class; any
  * other root (or a page with no root element) is handled by the parent
  * class's process() method.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, or NULL if
  *    $page is not a string
  *
  */
 function process($page, $url)
 {
     if (!is_string($page)) {
         return NULL;
     }
     self::closeDanglingTags($page);
     $dom = self::dom($page);
     $root_name = "";
     if (isset($dom->documentElement->nodeName)) {
         $root_name = $dom->documentElement->nodeName;
     }
     // the DOM was only needed to learn the root element's name
     unset($dom);
     $root_to_class = array("rss" => "RssProcessor", "html" => "HtmlProcessor", "sitemapindex" => "SitemapProcessor", "urlset" => "SitemapProcessor", "svg" => "SvgProcessor");
     if (!isset($root_to_class[$root_name])) {
         return parent::process($page, $url);
     }
     $class_name = $root_to_class[$root_name];
     $delegate = new $class_name($this->plugin_instances);
     return $delegate->process($page, $url);
 }
Beispiel #6
0
 /**
  * Computes a summary based on a string of a binary Powerpoint document
  * (as opposed to the modern xml powerpoint format).
  *
  * Text is extracted from the Powerpoint document using a crude finite
  * state machine that was developed by looking at a few Powerpoint
  * documents in a Hex editor. Then the TextProcessor:: process() method
  * is used to make a summary
  *
  * The machine looks for the byte sequence 0, 0, 168, 15, reads the next
  * four bytes as a little-endian length, and then consumes exactly that
  * many bytes as a text segment, keeping only printable ASCII characters
  * and line feeds. Scanning stops for good once a segment contains the
  * master slide placeholder text ("...lick to edit Master title style").
  * A segment cut short by the end of the data is discarded. If no text is
  * found, the url is summarized instead.
  *
  * @param string $page string of a Powerpoint document
  * @param string $url location the document came from, not used by
  *     TextProcessor at this point. Some of its subclasses override
  *     this method and use url to produce complete links for
  *     relative links within a document
  *
  * @return array a summary of (title, description,links, and content) of
  *     the information in $page
  */
 function process($page, $url)
 {
     $text = "";
     if (is_string($page)) {
         $master_marker = "lick to edit Master title style";
         $marker_len = strlen($master_marker);
         $text_objects = array();
         $state = self::PPT_IGNORING;
         $text_len = 0;
         $text_len_pos = 0;
         $scan_text_pos = 0;
         $out_text = "";
         $len = strlen($page);
         for ($cur_char_pos = 0; $cur_char_pos < $len; $cur_char_pos++) {
             $ascii = ord($page[$cur_char_pos]);
             switch ($state) {
                 case self::PPT_IGNORING:
                     if ($ascii == 0) {
                         $state = self::ZEROONE_IGNORING;
                     }
                     break;
                 case self::ZEROONE_IGNORING:
                     if ($ascii == 0) {
                         $state = self::ZEROTWO_IGNORING;
                     } else {
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::ZEROTWO_IGNORING:
                     // further zero bytes keep us in this state
                     if ($ascii == 168) {
                         $state = self::FIRST_CHAR_TEXT_SEG;
                     } else if ($ascii != 0) {
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::FIRST_CHAR_TEXT_SEG:
                     if ($ascii == 15) {
                         $state = self::READ_LEN_TEXT_SEG;
                         $text_len = 0;
                         $text_len_pos = 0;
                     } else {
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::READ_LEN_TEXT_SEG:
                     // four length bytes, least significant byte first
                     $text_len += $ascii << ($text_len_pos * 8);
                     $text_len_pos++;
                     if ($text_len_pos == 4) {
                         $scan_text_pos = 0;
                         $out_text = "";
                         // an empty segment has nothing to scan
                         $state = ($text_len > 0) ? self::SCAN_TEXT_SEG : self::PPT_IGNORING;
                     }
                     break;
                 case self::SCAN_TEXT_SEG:
                     /* Every byte of the segment counts towards its
                        length, whether or not it is kept; otherwise the
                        scan would run past the end of the segment into
                        the bytes that follow it.
                      */
                     $scan_text_pos++;
                     if (($ascii >= 32 && $ascii <= 126) || $ascii == 10) {
                         $out_text .= chr($ascii);
                         /* The marker is tested after each appended
                            character, so it can only ever newly appear
                            at the very end of the text collected so far.
                          */
                         if (substr($out_text, -$marker_len) === $master_marker) {
                             // master slide text reached: drop this
                             // segment and stop scanning altogether
                             $state = self::ALWAYS_IGNORE;
                             break 2;
                         }
                     }
                     if ($scan_text_pos >= $text_len) {
                         $text_objects[] = $out_text;
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::ALWAYS_IGNORE:
                     // nothing after this point is ever used
                     break 2;
             }
         }
         $text = implode("\n", $text_objects);
     }
     if ($text == "") {
         // nothing was extracted, so summarize the url instead
         $text = $url;
     }
     $summary = parent::process($text, $url);
     return $summary;
 }
Beispiel #7
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of rss or atom news feed data.
  *
  * The document is treated as atom when it contains at least one <feed>
  * element, and as rss otherwise. If neither a title, a description nor
  * any links are found, the page is handed to the parent class's
  * process() method to still try to get urls.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, or NULL if
  *    $page is not a string or no DOM could be built from it
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     if (is_string($page)) {
         $dom = self::dom($page);
         // the DOM must be checked before it is queried: calling a method
         // on false would be a fatal error
         if ($dom !== false) {
             $atom = $dom->getElementsByTagName('feed')->length > 0;
             $summary[self::TITLE] = self::title($dom, $atom);
             $summary[self::DESCRIPTION] = self::description($dom, $atom);
             $summary[self::LANG] = self::lang($dom, $summary[self::DESCRIPTION]);
             $summary[self::LINKS] = self::links($dom, $url, $atom);
             if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) {
                 //maybe not rss or atom? treat as text still try to get urls
                 $summary = parent::process($page, $url);
             }
         }
     }
     return $summary;
 }