예제 #1
0
 /**
  * Collects links from an HTML dom object, keyed by canonicalized url,
  * with some descriptive text as the value for each url.
  *
  * If the document has a <base> tag with a non-empty href, $site is first
  * re-canonicalized against it. Three kinds of elements are then scanned
  * in order:
  *  - <a> tags in the body whose rel attribute does not contain
  *    "nofollow"; the text is the whitespace-collapsed anchor text
  *  - <frame> and <iframe> tags; the text is "HTMLframe"
  *  - <img> tags with a non-empty alt attribute; the text is the alt text
  * A url is kept only if UrlParser::checkRecursiveUrl() is false for it
  * and its length is greater than 4 and less than MAX_URL_LEN. When the
  * same url is seen again its new text is appended after " .. ". Anchor
  * and image text is cut to 2 * MAX_LINKS_WORD_TEXT characters.
  *
  * Anchors and frames share one quota of MAX_LINKS_TO_EXTRACT accepted
  * links; images are counted against a fresh quota of the same size.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url, used as the base
  *     when canonicalizing the links found
  *
  * @return array   associative array of url => link text
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);

     // a <base href="..."> tag overrides the url links are resolved against
     $base_tag = $xpath->evaluate("/html//base")->item(0);
     if ($base_tag) {
         $base_href = $base_tag->getAttribute('href');
         if (strlen($base_href) > 0) {
             $site = UrlParser::canonicalLink($base_href, $site);
         }
     }

     $num_found = 0;

     // anchor tags, honoring rel="nofollow"
     foreach ($xpath->evaluate("/html/body//a") as $anchor) {
         if ($num_found >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $rel = $anchor->getAttribute("rel");
         if ($rel != "" && stristr($rel, "nofollow")) {
             continue;
         }
         $url = UrlParser::canonicalLink($anchor->getAttribute('href'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $anchor_text = preg_replace("/\\s+/", " ", strip_tags($anchor->nodeValue));
         $combined = isset($sites[$url]) ? $sites[$url] . " .. " . $anchor_text : $anchor_text;
         $sites[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $num_found++;
     }

     // frames and iframes continue to use the quota left over from anchors
     foreach ($xpath->evaluate("/html/frameset/frame|/html/body//iframe") as $frame) {
         if ($num_found >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $sites[$url] = isset($sites[$url]) ? $sites[$url] . " .. HTMLframe" : "HTMLframe";
         $num_found++;
     }

     // NOTE(review): the counter restarts here, so images get their own
     // quota and the total may exceed MAX_LINKS_TO_EXTRACT -- confirm
     // this is intended.
     $num_found = 0;
     foreach ($xpath->evaluate("/html/body//img[@alt]") as $img) {
         if ($num_found >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $alt = $img->getAttribute('alt');
         if (strlen($alt) < 1) {
             continue;
         }
         $url = UrlParser::canonicalLink($img->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $combined = isset($sites[$url]) ? $sites[$url] . " .. " . $alt : $alt;
         $sites[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $num_found++;
     }

     return $sites;
 }
예제 #2
0
 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. We allow more links from a sitemap
  * than from other kinds of documents (at most MAX_LINKS_PER_SITEMAP).
  * For now we are ignoring weighting info.
  *
  * Both <urlset> documents and <sitemapindex> documents in the
  * sitemaps.org 0.9 namespace are handled; the text of each <loc>
  * element is treated as a link. Links are skipped if they canonicalize
  * to nothing, are flagged by UrlParser::checkRecursiveUrl(), are at
  * least MAX_URL_LEN long, or have document type "gz".
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   associative array of url => "From sitemap of $site"
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if ($nodes === false) {
             // evaluate() failed for this path; nothing to iterate over
             continue;
         }
         foreach ($nodes as $node) {
             // test the limit before adding so that we never accept more
             // than MAX_LINKS_PER_SITEMAP links
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 continue;
             }
             if (UrlParser::getDocumentType($url) == "gz") {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
         }
     }
     return $sites;
 }
예제 #3
0
 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * Rules are collected only from groups whose User-agent line matches
  * USER_AGENT_SHORT. A User-agent value equal to USER_AGENT_SHORT takes
  * precedence: when first seen, rules gathered so far are discarded, and
  * from then on groups that only match by wildcard/substring are ignored.
  * Otherwise a User-agent value matches if its fragments (split on "*")
  * occur in USER_AGENT_SHORT in order, case-insensitively.
  *
  * @param string $page text string of a robots.txt document
  * @param string $url location the document came from; its host is used
  *     to canonicalize urls given on Sitemap lines
  *
  * @return array a summary with self::TITLE and self::DESCRIPTION (empty
  *     strings), self::LANG (NULL), self::ROBOT_PATHS (may contain
  *     self::ALLOWED_SITES and self::DISALLOWED_SITES lists),
  *     self::AGENT_LIST (every User-agent value seen), self::LINKS
  *     (sitemap urls), self::PAGE (the tag-stripped text wrapped in a
  *     pre tag) and, if an applicable crawl delay of at most
  *     MAXIMUM_CRAWL_DELAY was given, self::CRAWL_DELAY
  */
 function process($page, $url)
 {
     $summary = array();
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     $add_rule_state = false;
     $delay_flag = false;
     $delay = 0;
     // 1 once a User-agent line exactly equal to USER_AGENT_SHORT has been
     // seen. This has to live outside the loop: resetting it per line would
     // make the precedence checks below compare against 0 every time.
     $specificness = 0;
     foreach ($lines as $pre_line) {
         // drop anything after a comment marker
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         // only the first colon separates field from value (urls have colons)
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 // NOTE(review): each User-agent line replaces the match
                 // state of the previous one, so several consecutive
                 // User-agent lines sharing one rule group are decided by
                 // the last of them -- confirm whether that is acceptable.
                 $summary[self::AGENT_LIST][] = $value;
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     // an exact-match group was already found, so the rules
                     // of this less specific group must not be collected
                     $add_rule_state = false;
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     // continue searching after this fragment so that
                     // fragments must occur in order without overlapping
                     $offset = $new_offset + strlen($part);
                 }
                 break;
             case "sitemap":
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if (!UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 if ($add_rule_state) {
                     $delay_flag = true;
                     // keep the largest delay requested of us
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             // requested delay is larger than we accept, so treat the
             // whole site as disallowed instead
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }
예제 #4
0
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * Links are taken from the Target attribute of every Relationship
  * element (inside a Relationships element) whose Type attribute is the
  * Office Open XML hyperlink relationship type. A link is kept if
  * UrlParser::checkRecursiveUrl() is false for its canonical url and that
  * url is shorter than MAX_URL_LEN. The text stored for a url is the raw
  * Target value; repeated urls get their Targets joined by a space.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array associative array of url => target text from the $dom
  *     object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" .
         "relationships/hyperlink";
     $num_links = 0;
     foreach ($dom->getElementsByTagName("Relationships") as $group) {
         foreach ($group->getElementsByTagName("Relationship") as $relation) {
             if (strcmp($relation->getAttribute('Type'), $hyperlink) != 0) {
                 // not a hyperlink relationship
                 continue;
             }
             if ($num_links >= MAX_LINKS_TO_EXTRACT) {
                 // quota used up, nothing further can be added
                 break 2;
             }
             $target = $relation->getAttribute('Target');
             $url = UrlParser::canonicalLink($target, $site);
             if (UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 continue;
             }
             $sites[$url] = isset($sites[$url]) ? $sites[$url] . " " . $target : $target;
             $num_links++;
         }
     }
     return $sites;
 }
예제 #5
0
 /**
  * Returns links from the supplied dom object of a presentation slide,
  * where links have been canonicalized according to the supplied $site
  * information. At most MAX_LINKS_TO_EXTRACT hyperlink nodes are examined.
  *
  * For every a:hlinkClick node found under the slide's text bodies, the
  * text of the first "t" element beneath the node's grandparent is used
  * both as the link to canonicalize and as the text stored for it. A link
  * is kept if UrlParser::checkRecursiveUrl() is false for its canonical
  * url and that url is non-empty and shorter than MAX_URL_LEN. Repeated
  * urls get their texts joined by a space.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array associative array of url => link text from the $dom
  *     object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n            p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
     if ($paras === false) {
         // the expression could not be evaluated against this document
         // (for instance the p:/a: prefixes are not declared on it)
         return $sites;
     }
     $i = 0;
     foreach ($paras as $para) {
         if ($i >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         // every examined node counts toward the limit, kept or not
         $i++;
         $text_node = $para->parentNode->parentNode->getElementsByTagName("t")->item(0);
         if ($text_node === null) {
             // hyperlink without any text element to read a url from
             continue;
         }
         $hlink = $text_node->nodeValue;
         $url = UrlParser::canonicalLink($hlink, $site);
         $len = strlen($url);
         if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 0) {
             if (isset($sites[$url])) {
                 $sites[$url] .= " " . $hlink;
             } else {
                 $sites[$url] = $hlink;
             }
         }
     }
     return $sites;
 }
예제 #6
0
 /**
  * Returns a url text pair where the url comes from the link of
  * the given item node and the text comes from the text data for that node.
  * urls are canonicalized according to site.
  *
  * For RSS the url is the text content of the link tag; for Atom it is
  * the link tag's href attribute. If the item has no text tag, or the tag
  * is empty, the text defaults to "RSS Feed" (or "Atom Feed" when $atom
  * is true). Runs of whitespace in the text are collapsed to one space.
  *
  * @param object $item_node the DOMNode to get a link and text from
  * @param string $link_name name of link tag
  * @param string $text_name name of text tag to associate with link
  * @param string $site   a string containing a url
  * @param bool $atom true if the feed is atom, false if it is rss
  *
  * @return mixed array a url,text pair; or false if the item has no link
  *     tag, or a link that is empty, at least MAX_URL_LEN long, or
  *     flagged by UrlParser::checkRecursiveUrl()
  */
 static function linkAndTexts($item_node, $link_name, $text_name, $site, $atom = false)
 {
     $default_text = $atom ? "Atom Feed" : "RSS Feed";
     $url = NULL;
     // start from the default so that an item without any text tag still
     // yields usable text rather than an undefined variable
     $text = $default_text;
     foreach ($item_node->childNodes as $node) {
         if ($node->nodeName == $link_name) {
             if (!$atom) {
                 $url = UrlParser::canonicalLink($node->textContent, $site);
             } else {
                 $url = UrlParser::canonicalLink($node->getAttribute("href"), $site);
             }
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 return false;
             }
         }
         if ($node->nodeName == $text_name) {
             $text = $node->textContent;
             if ($text == "") {
                 $text = $default_text;
             }
         }
     }
     if (!isset($url) || $url == "") {
         // no link tag was found among the item's children
         return false;
     }
     $text = mb_ereg_replace("(\\s)+", " ", $text);
     return array($url, $text);
 }