Esempi in PHP per UrlParser::checkRecursiveUrl

Linguaggio di programmazione: PHP

Classe/tipologia: UrlParser

Metodo/funzione: checkRecursiveUrl

Esempi su hotexamples.com: 6

UrlParser::checkRecursiveUrl in PHP: 6 esempi trovati. Questi sono i migliori esempi reali in PHP per UrlParser::checkRecursiveUrl, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

canonicalLink(12)

getHost(11)

getDocumentType(7)

checkRecursiveUrl(6)

getDocumentFilename(5)

getPath(4)

urlMemberSiteArray(4)

getScheme(4)

getWordsLastPathPartUrl(3)

getWordsIfHostUrl(3)

getPathArray(3)

isLocalhostUrl(3)

isPathMemberRegexPaths(3)

simplifyUrl(3)

getHostAndPath(2)

pruneLinks(2)

guessMimeTypeFromFileName(2)

getHostSubdomains(2)

parse(1)

parseUrl(1)

isVideoUrl(1)

getPort(1)

isFollowUrl(1)

getLang(1)

getHostPaths(1)

getCourseDirName(1)

defaultFilter(1)

cleanRedundantLinks(1)

urlParse(1)

Esempio n. 1

Mostra file

File: html_processor.php Progetto: yakar/yioop

 /**
  * Gathers outgoing links from an HTML document: anchors, frames/iframes
  * and images that carry alt text. If the document has a <base> element
  * with a non-empty href, relative urls are resolved against it rather
  * than against $site.
  *
  * A url is kept only when UrlParser::checkRecursiveUrl() rejects it as
  * recursive and its length is strictly between 4 and MAX_URL_LEN.
  * Anchors and frames share one counter capped at MAX_LINKS_TO_EXTRACT;
  * the image pass starts counting again from zero with the same cap.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   canonical url => text associated with that url
  *     (anchor text, "HTMLframe", or image alt text; repeated urls have
  *     their texts joined with " .. ")
  */
 static function links($dom, $site)
 {
     $xpath = new DOMXPath($dom);
     // A <base href> overrides the url that relative links resolve against
     $base_node = $xpath->evaluate("/html//base")->item(0);
     if ($base_node) {
         $base_href = $base_node->getAttribute('href');
         if (strlen($base_href) > 0) {
             $site = UrlParser::canonicalLink($base_href, $site);
         }
     }
     $found = array();
     $num_links = 0;
     foreach ($xpath->evaluate("/html/body//a") as $anchor) {
         if ($num_links >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $rel = $anchor->getAttribute("rel");
         if ($rel != "" && stristr($rel, "nofollow")) {
             // the page asks crawlers not to follow this anchor
             continue;
         }
         $url = UrlParser::canonicalLink($anchor->getAttribute('href'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         // collapse whitespace runs so the stored anchor text is one line
         $anchor_text = preg_replace("/\\s+/", " ", strip_tags($anchor->nodeValue));
         $combined = isset($found[$url]) ? $found[$url] . " .. " . $anchor_text : $anchor_text;
         $found[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $num_links++;
     }
     foreach ($xpath->evaluate("/html/frameset/frame|/html/body//iframe") as $frame) {
         if ($num_links >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $found[$url] = isset($found[$url]) ? $found[$url] . " .. HTMLframe" : "HTMLframe";
         $num_links++;
     }
     // images are counted separately from anchors and frames
     $num_images = 0;
     foreach ($xpath->evaluate("/html/body//img[@alt]") as $img) {
         if ($num_images >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $alt = $img->getAttribute('alt');
         if (strlen($alt) < 1) {
             continue;
         }
         $url = UrlParser::canonicalLink($img->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $combined = isset($found[$url]) ? $found[$url] . " .. " . $alt : $alt;
         $found[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $num_images++;
     }
     return $found;
 }

Esempio n. 2

Mostra file

File: sitemap_processor.php Progetto: yakar/yioop

 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. Both <urlset> sitemaps and
  * <sitemapindex> files are read. At most MAX_LINKS_PER_SITEMAP links are
  * returned. Weighting info in the sitemap is ignored.
  *
  * Urls are skipped when they are empty, when
  * UrlParser::checkRecursiveUrl() flags them, when their document type is
  * "gz", or when they are MAX_URL_LEN or more bytes long.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   canonical url => "From sitemap of $site"
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if ($nodes === false) {
             // evaluate() signals failure with false, which cannot be
             // iterated; treat it as "no links for this path"
             continue;
         }
         foreach ($nodes as $node) {
             // check the cap before adding so that no more than
             // MAX_LINKS_PER_SITEMAP links are ever collected
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
         }
     }
     return $sites;
 }

Esempio n. 3

Mostra file

File: robot_processor.php Progetto: yakar/yioop

 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * Rules are collected only from groups whose User-agent line matches
  * USER_AGENT_SHORT, either exactly or through a pattern containing *.
  * Once an exact match has been seen, the rules gathered so far are
  * discarded and groups for less specific agents are ignored.
  *
  * @param string $page text string of a robots.txt document
  * @param string $url location the document came from; its host is used
  *     to make Sitemap entries into complete urls
  *
  * @return array a summary with entries for self::TITLE,
  *     self::DESCRIPTION, self::LANG, self::ROBOT_PATHS, self::AGENT_LIST,
  *     self::LINKS (sitemap urls), self::PAGE (the tag-stripped page
  *     wrapped in <pre>), and self::CRAWL_DELAY when a usable crawl-delay
  *     was given
  */
 function process($page, $url)
 {
     $summary = array();
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     $add_rule_state = false;
     $delay_flag = false;
     $delay = 0;
     // Best user-agent match seen so far (1 = exact, 0 = pattern/none).
     // It has to survive from one line to the next, otherwise an exact
     // match could never take precedence over later groups.
     $specificness = 0;
     foreach ($lines as $pre_line) {
         // everything after a # is a comment
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         // only the first ":" separates field from value, so values such
         // as urls keep their own colons
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 $summary[self::AGENT_LIST][] = $value;
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     // an exact match was already seen, so the rules of
                     // this less specific group must not be added
                     $add_rule_state = false;
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 // the pieces between *'s must occur in our agent name,
                 // in order, ignoring case
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     // continue after the matched piece so the same
                     // characters cannot satisfy two pieces of the pattern
                     $offset = $new_offset + strlen($part);
                 }
                 break;
             case "sitemap":
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if ($tmp_url !== NULL && $tmp_url !== "" && !UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 if ($add_rule_state) {
                     $delay_flag = true;
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             // a delay above the maximum is treated as a request
             // not to crawl the site at all
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }

Esempio n. 4

Mostra file

File: xlsx_processor.php Progetto: yakar/yioop

 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information. Links are taken from the Target
  * attribute of Relationship elements whose Type is the Office Open XML
  * hyperlink relationship.
  *
  * Urls that are empty, that UrlParser::checkRecursiveUrl() flags, or
  * that are MAX_URL_LEN or more bytes long are skipped.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array canonical url => the raw Target value(s) it came from,
  *     space separated when the same url occurs more than once
  */
 static function links($dom, $site)
 {
     $sites = array();
     $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
     $i = 0;
     $relationships = $dom->getElementsByTagName("Relationships");
     foreach ($relationships as $relationship) {
         $relations = $relationship->getElementsByTagName("Relationship");
         foreach ($relations as $relation) {
             if ($i >= MAX_LINKS_TO_EXTRACT) {
                 // nothing more can be added, so stop scanning
                 break 2;
             }
             if (strcmp($relation->getAttribute('Type'), $hyperlink) != 0) {
                 continue;
             }
             $link = $relation->getAttribute('Target');
             $url = UrlParser::canonicalLink($link, $site);
             // an empty result would otherwise be stored under the "" key
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 continue;
             }
             if (isset($sites[$url])) {
                 $sites[$url] .= " " . $link;
             } else {
                 $sites[$url] = $link;
             }
             $i++;
         }
     }
     return $sites;
 }

Esempio n. 5

Mostra file

File: pptx_processor.php Progetto: yakar/yioop

 /**
  * Returns links from the supplied dom object of a slide, where links
  * have been canonicalized according to the supplied $site information.
  * At most MAX_LINKS_TO_EXTRACT hlinkClick nodes are examined, so at most
  * that many links are returned.
  *
  * The url is read from the text of the first "t" element of the run
  * that contains the hlinkClick node. Runs without such an element are
  * skipped, as are urls that are empty, that
  * UrlParser::checkRecursiveUrl() flags, or that are MAX_URL_LEN or more
  * bytes long.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array canonical url => the raw link text(s) it came from,
  *     space separated when the same url occurs more than once
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n            p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
     if (!$paras) {
         // evaluate() returns false when the expression cannot be
         // evaluated against this document; there is nothing to iterate
         return $sites;
     }
     $i = 0;
     foreach ($paras as $para) {
         if ($i >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         // every examined node counts toward the limit, kept or not
         $i++;
         $text_node = $para->parentNode->parentNode->getElementsByTagName("t")->item(0);
         if (!$text_node) {
             // the run has no text element, so there is no url to read
             continue;
         }
         $hlink = $text_node->nodeValue;
         $url = UrlParser::canonicalLink($hlink, $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 0) {
             continue;
         }
         if (isset($sites[$url])) {
             $sites[$url] .= " " . $hlink;
         } else {
             $sites[$url] = $hlink;
         }
     }
     return $sites;
 }

Esempio n. 6

Mostra file

File: rss_processor.php Progetto: yakar/yioop

 /**
  * Returns a url text pair where the url comes from the link of
  * the given item node and the text comes from the text data for that node.
  * urls are canonicalized according to site.
  *
  * For rss the url is the text content of the link tag; for atom it is
  * the link tag's href attribute. When the text tag is missing or empty
  * the text defaults to "RSS Feed" (or "Atom Feed" for atom). Whitespace
  * runs in the text are replaced by a single space.
  *
  * @param object $item_node the DOMNode to get a link and text from
  * @param string $link_name name of link tag
  * @param string $text_name name of text tag to associate with link
  * @param string $site   a string containing a url
  * @param bool $atom if the feed is atom or rss
  *
  * @return mixed a url,text pair as a two element array, or false if the
  *     item has no usable link (missing, empty, flagged by
  *     UrlParser::checkRecursiveUrl(), or MAX_URL_LEN or more bytes long)
  */
 static function linkAndTexts($item_node, $link_name, $text_name, $site, $atom = false)
 {
     $default_text = $atom ? "Atom Feed" : "RSS Feed";
     $url = NULL;
     // start from the default so an item with no text tag at all still
     // yields a defined, meaningful label
     $text = $default_text;
     foreach ($item_node->childNodes as $node) {
         if ($node->nodeName == $link_name) {
             if (!$atom) {
                 $url = UrlParser::canonicalLink($node->textContent, $site);
             } else {
                 $url = UrlParser::canonicalLink($node->getAttribute("href"), $site);
             }
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 return false;
             }
         }
         if ($node->nodeName == $text_name) {
             $text = $node->textContent;
             if ($text == "") {
                 $text = $default_text;
             }
         }
     }
     if ($url === NULL || $url == "") {
         return false;
     }
     $text = mb_ereg_replace("(\\s)+", " ", $text);
     return array($url, $text);
 }