/**
 * Extracts links from an HTML dom object, canonicalized against the
 * supplied $site (or against the document's <base href> when one is
 * present).
 *
 * Three kinds of elements are harvested, in this order:
 *  - anchors under /html/body whose rel attribute does not contain
 *    "nofollow"; the associated text is the whitespace-collapsed anchor
 *    text
 *  - frame and iframe elements; the associated text is "HTMLframe"
 *  - img elements with a non-empty alt attribute; the associated text is
 *    the alt text
 *
 * Anchors and frames share one budget of MAX_LINKS_TO_EXTRACT accepted
 * links; the counter is reset before images, so images get their own
 * budget of MAX_LINKS_TO_EXTRACT. When the same url is seen more than once
 * its texts are joined with " .. ".
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array map from canonical url => text associated with that url
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);

    // A <base href="..."> changes what relative links are resolved against
    $base_refs = $xpath->evaluate("/html//base");
    if ($base_refs->item(0)) {
        $tmp_site = $base_refs->item(0)->getAttribute('href');
        if (strlen($tmp_site) > 0) {
            $site = UrlParser::canonicalLink($tmp_site, $site);
        }
    }

    $i = 0;
    $hrefs = $xpath->evaluate("/html/body//a");
    foreach ($hrefs as $href) {
        if ($i >= MAX_LINKS_TO_EXTRACT) {
            // budget used up, nothing further can be accepted
            break;
        }
        $rel = $href->getAttribute("rel");
        if ($rel != "" && stristr($rel, "nofollow")) {
            continue;
        }
        $url = UrlParser::canonicalLink($href->getAttribute('href'), $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN
            || $len <= 4) {
            continue;
        }
        $text = preg_replace("/\\s+/", " ", strip_tags($href->nodeValue));
        if (isset($sites[$url])) {
            $text = $sites[$url] . " .. " . $text;
        }
        $sites[$url] = mb_substr($text, 0, 2 * MAX_LINKS_WORD_TEXT);
        $i++;
    }

    // frames continue to draw on the same budget as the anchors above
    $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
    foreach ($frames as $frame) {
        if ($i >= MAX_LINKS_TO_EXTRACT) {
            break;
        }
        $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN
            || $len <= 4) {
            continue;
        }
        if (isset($sites[$url])) {
            // same " .. " separator as is used for anchor and image text
            $sites[$url] .= " .. HTMLframe";
        } else {
            $sites[$url] = "HTMLframe";
        }
        $i++;
    }

    // images are counted separately: the counter starts again from zero
    $imgs = $xpath->evaluate("/html/body//img[@alt]");
    $i = 0;
    foreach ($imgs as $img) {
        if ($i >= MAX_LINKS_TO_EXTRACT) {
            break;
        }
        $alt = $img->getAttribute('alt');
        if (strlen($alt) < 1) {
            continue;
        }
        $url = UrlParser::canonicalLink($img->getAttribute('src'), $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN
            || $len <= 4) {
            continue;
        }
        $text = isset($sites[$url]) ? $sites[$url] . " .. " . $alt : $alt;
        $sites[$url] = mb_substr($text, 0, 2 * MAX_LINKS_WORD_TEXT);
        $i++;
    }

    return $sites;
}
/**
 * Returns links from the supplied dom object of a sitemap
 * where links have been canonicalized according to
 * the supplied $site information. Both url entries of a urlset and
 * sitemap entries of a sitemapindex are collected. At most
 * MAX_LINKS_PER_SITEMAP entries are accepted. Weighting info in the
 * sitemap is ignored, and links whose document type is "gz" are skipped.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array map from canonical url => "From sitemap of $site"
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $xpath->registerNamespace('s',
        "http://www.sitemaps.org/schemas/sitemap/0.9");
    $paths = array(
        "/s:urlset/s:url/s:loc",
        "/s:sitemapindex/s:sitemap/s:loc"
    );

    $i = 0;
    foreach ($paths as $path) {
        $nodes = @$xpath->evaluate($path);
        if ($nodes === false) {
            // evaluation failed; nothing to iterate for this path
            continue;
        }
        foreach ($nodes as $node) {
            $url = UrlParser::canonicalLink($node->textContent, $site);
            if ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                UrlParser::getDocumentType($url) == "gz" ||
                strlen($url) >= MAX_URL_LEN) {
                //at this point we can't handle gzip'd sitemaps
                continue;
            }
            $sites[$url] = "From sitemap of " . $site;
            $i++;
            // >= rather than > so that exactly MAX_LINKS_PER_SITEMAP
            // entries are accepted, not one more than that
            if ($i >= MAX_LINKS_PER_SITEMAP) {
                break 2;
            }
        }
    }

    return $sites;
}
/**
 * Parses the contents of a robots.txt page extracting allowed,
 * disallowed paths, crawl-delay, and sitemaps. We also extract a
 * list of all user agent strings seen.
 *
 * Rules are only recorded while the most recent User-agent line applies
 * to USER_AGENT_SHORT. A User-agent value equal to USER_AGENT_SHORT takes
 * precedence: when it is first seen any paths collected so far are
 * discarded, and from then on groups for other (wildcard/partial) agent
 * strings are ignored.
 *
 * @param string $page text string of a document
 * @param string $url location the document came from; its host is used
 *      to canonicalize Sitemap entries
 *
 * @return array a summary with TITLE, DESCRIPTION, LANG, ROBOT_PATHS,
 *      AGENT_LIST, LINKS (sitemap urls) and PAGE entries, plus CRAWL_DELAY
 *      when an applicable delay of at most MAXIMUM_CRAWL_DELAY was given
 */
function process($page, $url)
{
    $summary = array();
    $summary[self::TITLE] = "";
    $summary[self::DESCRIPTION] = "";
    $summary[self::LANG] = NULL;
    $summary[self::ROBOT_PATHS] = array();
    $summary[self::AGENT_LIST] = array();
    $summary[self::LINKS] = array();

    $host_url = UrlParser::getHost($url);
    $lines = explode("\n", $page);

    $add_rule_state = false;
    $delay_flag = false;
    $delay = 0;
    // Must live across lines: it remembers the best user-agent match
    // seen so far (1 = exact match on USER_AGENT_SHORT, 0 = anything else)
    $specificness = 0;

    foreach ($lines as $pre_line) {
        // drop anything after a # comment marker
        $pre_line_parts = explode("#", $pre_line);
        $line = $pre_line_parts[0];
        $line_parts = explode(":", $line);
        if (!isset($line_parts[1])) {
            continue;
        }
        // only the first ":" separates field from value (urls contain ":")
        $field = array_shift($line_parts);
        $value = implode(":", $line_parts);
        //notice we lower case field, so switch below is case insensitive
        $field = strtolower(trim($field));
        $value = trim($value);
        if (strlen($value) == 0) {
            continue;
        }
        switch ($field) {
            case "user-agent":
                //we allow * in user agent string
                $summary[self::AGENT_LIST][] = $value;
                $current_specificness =
                    (strcmp($value, USER_AGENT_SHORT) == 0) ? 1 : 0;
                if ($current_specificness < $specificness) {
                    // an exact group was already seen; rules of this less
                    // specific group must not be recorded
                    $add_rule_state = false;
                    break;
                }
                if ($specificness < $current_specificness) {
                    //Give precedence to exact match on agent string
                    $specificness = $current_specificness;
                    $add_rule_state = true;
                    $summary[self::ROBOT_PATHS] = array();
                    break;
                }
                // each non-empty piece between *'s has to occur, in
                // order, within USER_AGENT_SHORT (case-insensitively)
                $agent_parts = explode("*", $value);
                $offset = 0;
                $add_rule_state = true;
                foreach ($agent_parts as $part) {
                    if ($part == "") {
                        continue;
                    }
                    $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                    if ($new_offset === false) {
                        $add_rule_state = false;
                        break;
                    }
                    $offset = $new_offset;
                }
                break;
            case "sitemap":
                // sitemaps are collected regardless of the current agent
                $tmp_url = UrlParser::canonicalLink($value, $host_url);
                if (!UrlParser::checkRecursiveUrl($tmp_url)
                    && strlen($tmp_url) < MAX_URL_LEN) {
                    $summary[self::LINKS][] = $tmp_url;
                }
                break;
            case "allow":
                if ($add_rule_state) {
                    $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] =
                        $this->makeCanonicalRobotPath($value);
                }
                break;
            case "disallow":
                if ($add_rule_state) {
                    $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] =
                        $this->makeCanonicalRobotPath($value);
                }
                break;
            case "crawl-delay":
                if ($add_rule_state) {
                    $delay_flag = true;
                    // keep the largest applicable delay
                    $delay = max($delay, intval($value));
                }
                break;
        }
    }

    if ($delay_flag) {
        if ($delay > MAXIMUM_CRAWL_DELAY) {
            // requested delay is larger than we accept: treat the whole
            // site as disallowed instead
            $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
        } else {
            $summary[self::CRAWL_DELAY] = $delay;
        }
    }

    $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) .
        "</pre></body></html>";
    return $summary;
}
/**
 * Collects hyperlink targets from a dom object made of Relationships
 * elements containing Relationship entries. Only entries whose Type
 * attribute is the openxmlformats hyperlink relationship type are used.
 * Each Target is canonicalized against $site, and at most
 * MAX_LINKS_TO_EXTRACT entries are accepted.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array map from canonical url => the raw Target value(s) that
 *      produced it, joined by a space when a url occurs more than once
 */
static function links($dom, $site)
{
    $hyperlink_type =
        "http://schemas.openxmlformats.org/officeDocument/2006/" .
        "relationships/hyperlink";
    $found = array();
    $accepted = 0;

    foreach ($dom->getElementsByTagName("Relationships") as $group) {
        foreach ($group->getElementsByTagName("Relationship") as $entry) {
            // ignore relationships that are not hyperlinks
            if ($entry->getAttribute('Type') !== $hyperlink_type) {
                continue;
            }
            if ($accepted >= MAX_LINKS_TO_EXTRACT) {
                continue;
            }
            $target = $entry->getAttribute('Target');
            $canonical = UrlParser::canonicalLink($target, $site);
            if (UrlParser::checkRecursiveUrl($canonical)
                || strlen($canonical) >= MAX_URL_LEN) {
                continue;
            }
            $found[$canonical] = isset($found[$canonical])
                ? $found[$canonical] . " " . $target
                : $target;
            $accepted++;
        }
    }

    return $found;
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * slide dom object where links have been canonicalized according to
 * the supplied $site information.
 *
 * For every a:hlinkClick found inside a text run, the text of the first
 * "t" element under the hlinkClick's grandparent node is taken as the
 * link and canonicalized. Matches without such a text element are
 * skipped.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array map from canonical url => the raw link text(s) that
 *      produced it, joined by a space when a url occurs more than once
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
    if ($paras === false) {
        // the expression could not be evaluated against this document
        return $sites;
    }

    $i = 0;
    foreach ($paras as $para) {
        if ($i >= MAX_LINKS_TO_EXTRACT) {
            break;
        }
        $text_node = $para->parentNode->parentNode->
            getElementsByTagName("t")->item(0);
        if ($text_node === null) {
            // no text to read a link from
            continue;
        }
        $hlink = $text_node->nodeValue;
        $url = UrlParser::canonicalLink($hlink, $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN
            || $len <= 0) {
            continue;
        }
        if (isset($sites[$url])) {
            $sites[$url] .= " " . $hlink;
        } else {
            $sites[$url] = $hlink;
        }
        // only accepted links count toward the limit
        $i++;
    }

    return $sites;
}
/**
 * Returns a url text pair where the url comes from the link of
 * the given item node and the text comes from the text data for that node.
 * urls are canonicalized according to site.
 *
 * For rss the url is the text content of the link child; for atom it is
 * the link child's href attribute. If several children match a name, the
 * last one wins. When the text is missing or empty, "RSS Feed" (or
 * "Atom Feed" when $atom is true) is used instead. Runs of whitespace in
 * the text are collapsed to a single space.
 *
 * @param object $item_node the DOMNode to get a link and text from
 * @param string $link_name name of link tag
 * @param string $text_name name of text tag to associate with link
 * @param string $site a string containing a url
 * @param bool $atom if the feed is atom or rss
 *
 * @return mixed array(url, text) pair, or false if the item has no
 *      usable link (missing, empty, recursive, or too long url)
 */
static function linkAndTexts($item_node, $link_name, $text_name, $site,
    $atom = false)
{
    $url = NULL;
    // initialized so that an item without a text element is still handled
    $text = "";
    foreach ($item_node->childNodes as $node) {
        if ($node->nodeName == $link_name) {
            $raw_link = $atom ? $node->getAttribute("href")
                : $node->textContent;
            $url = UrlParser::canonicalLink($raw_link, $site);
            if ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                strlen($url) >= MAX_URL_LEN) {
                return false;
            }
        }
        if ($node->nodeName == $text_name) {
            $text = $node->textContent;
        }
    }
    if ($url === NULL || $url == "") {
        return false;
    }
    if ($text == "") {
        $text = $atom ? "Atom Feed" : "RSS Feed";
    }
    $text = mb_ereg_replace("(\\s)+", " ", $text);
    return array($url, $text);
}