PHP UrlParser::isPathMemberRegexPaths Examples

Programming Language: PHP

Class/Type: UrlParser

Method/Function: isPathMemberRegexPaths

Examples at hotexamples.com: 3

PHP UrlParser::isPathMemberRegexPaths - 3 examples found. These are the top-rated real-world PHP examples of UrlParser::isPathMemberRegexPaths extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

canonicalLink(12)

getHost(11)

getDocumentType(7)

checkRecursiveUrl(6)

getDocumentFilename(5)

getPath(4)

urlMemberSiteArray(4)

getScheme(4)

getWordsLastPathPartUrl(3)

getWordsIfHostUrl(3)

getPathArray(3)

isLocalhostUrl(3)

isPathMemberRegexPaths(3)

simplifyUrl(3)

getHostAndPath(2)

pruneLinks(2)

guessMimeTypeFromFileName(2)

getHostSubdomains(2)

parse(1)

parseUrl(1)

isVideoUrl(1)

getPort(1)

isFollowUrl(1)

getLang(1)

getHostPaths(1)

getCourseDirName(1)

defaultFilter(1)

cleanRedundantLinks(1)

urlParse(1)

Example #1

Show file

File: url_parser_test.php Project: yakar/yioop

 /**
  * Checks whether UrlParser::isPathMemberRegexPaths correctly decides if a
  * path matches one of a list of path patterns, presumably of the kind that
  * come from a robots.txt file.
  *
  * Each case is a 4-tuple: the path to test, the list of patterns, the
  * expected boolean result, and a description reported with the assertion.
  */
 function isPathMemberRegexPathsTestCase()
 {
     $cases = array(
         array("/bobby", array("/bob"), true, "Substring Positive"),
         array("/bobby", array("/alice", "/f/g/h/d"), false, "Substring Negative 1"),
         array("/bobby/", array("/bobby/bay", "/f/g/h/d", "/yo"), false, "Substring Negative 2"),
         array("/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false, "Substring Negative 3 (should match start)"),
         array("http://test.com/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false, "Substring Negative 4 (should match start)"),
         array("/a/bbbb/c/", array("/bobby/bay", "/a/*/c/", "/yo"), true, "Star Positive 1"),
         array("/a/bbbb/d/", array("/bobby/bay", "/a/*/c/", "/yo"), false, "Star Negative 1"),
         array("/test.html?a=b", array("/bobby/bay", "/*?", "/yo"), true, "Star Positive 2"),
         array("/test.html", array("/bobby/bay", "/*.html\$", "/yo"), true, "Dollar Positive 1"),
         array("/test.htmlish", array("/bobby/bay", "/*.html\$", "/yo"), false, "Dollar Negative 1"),
         array("/test.htmlish", array("/bobby/bay", "*", "/yo"), true, "Degenerate 1"),
         array("/test.html", array("/bobby/bay", "/**.html\$", "/yo"), true, "Degenerate 2"),
         array("/videos/search?q=Angelina+Jolie", array("/videos/search?"), true, "End With Question Regex Case 1"),
     );
     foreach ($cases as $case) {
         // tuple layout: 0 => path, 1 => patterns, 2 => expected, 3 => label
         $actual = UrlParser::isPathMemberRegexPaths($case[0], $case[1]);
         $this->assertEqual($actual, $case[2], $case[3]);
     }
 }

Example #2

Show file

File: url_parser.php Project: yakar/yioop

 /**
  * Checks if the url belongs to one of the sites listed in site_array.
  * Sites can be given either in the form domain:suffix, in which case the
  * host of the url must end with suffix, or in the form of a url, in which
  * case the site's host and path are each matched against the host and path
  * of the passed url using UrlParser::isPathMemberRegexPaths.
  *
  * The parsed form of $site_array is kept in a function-static cache under
  * $name, so later calls with the same $name reuse the first $site_array
  * seen for that name. The whole cache is emptied once it holds more than
  * 100 names.
  *
  * @param string $url url to check
  * @param array $site_array sites to check against
  * @param string $name identifier to store $site_array with in this
  *     function's cache
  * @param bool $return_rule whether when a match is found to return true or
  *     to return the matching site rule
  * @return mixed a false value if no site matches (or $site_array is not an
  *     array); otherwise a true value, or the matching rule when
  *     $return_rule is set
  */
 static function urlMemberSiteArray($url, $site_array, $name, $return_rule = false)
 {
     static $cache = array();
     if (!is_array($site_array)) {
         return false;
     }
     if (!isset($cache[$name])) {
         if (count($cache) > 100) {
             $cache = array();
         }
         // build the entry locally and store it once it is complete, so a
         // half-filled entry is never left behind in the cache
         $entry = array(
             "domains" => array(),
             "hosts" => array(),
             "paths" => array(),
             "sites" => array(),
         );
         foreach ($site_array as $site) {
             if (strncmp($site, "domain:", 7) === 0) {
                 $suffix = substr($site, 7);
                 // A bare "domain:" carries no suffix. On PHP 8 strrpos()
                 // accepts an empty needle and reports a match at the end
                 // of the string, which would make every host match, so
                 // empty suffixes are ignored.
                 if (is_string($suffix) && $suffix !== "") {
                     $entry["domains"][] = $suffix;
                 }
                 continue;
             }
             list($site_host, $site_path) = UrlParser::getHostAndPath($site, true, true);
             $entry["hosts"][] = $site_host;
             $entry["paths"][] = $site_path;
             $entry["sites"][] = $site;
         }
         $entry["domains"] = array_values(array_unique($entry["domains"]));
         $cache[$name] = $entry;
     }
     $domains = $cache[$name]["domains"];
     $hosts = $cache[$name]["hosts"];
     $paths = $cache[$name]["paths"];
     $sites = $cache[$name]["sites"];
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     // domain rules: the host has to end with the stored suffix
     foreach ($domains as $domain) {
         $pos = strrpos($host, $domain);
         if ($pos !== false && $pos + strlen($domain) === strlen($host)) {
             if ($return_rule) {
                 return "domain:{$domain}";
             }
             return true;
         }
     }
     // url rules: both the host part and the path part have to match
     $flag = false;
     $matched_site = null;
     foreach ($sites as $index => $site) {
         $flag = UrlParser::isPathMemberRegexPaths($host, array($hosts[$index]));
         if (!$flag) {
             continue;
         }
         $flag = UrlParser::isPathMemberRegexPaths($path, array($paths[$index]));
         if ($flag) {
             $matched_site = $site;
             break;
         }
     }
     if ($return_rule && $flag) {
         return $matched_site;
     }
     return $flag;
 }

Example #3

Show file

File: web_queue_bundle.php Project: yakar/yioop

 /**
  * Decides, from the robots.txt information stored for the host of $url,
  * whether $url may be crawled.
  *
  * The url's path is url-decoded and compared against the stored disallowed
  * and allowed path lists. The url is rejected only when its path matches a
  * disallowed entry and does not match an allowed entry; when no robots
  * paths are found for the host the url is accepted.
  *
  * @param string $url to check
  * @return bool whether it was allowed or not
  */
 function checkRobotOkay($url)
 {
     // function-static memo of robots.txt records already fetched
     static $robot_cache = array();
     $cache_size = 2000;
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     $path = urldecode($path);
     $key = crawlHash($host, true);
     if (!isset($robot_cache[$key])) {
         // not memoised yet: find the record's offset, then read the record
         $record = $this->robot_table->lookup($key);
         $offset = unpackInt($record);
         $robot_object = $this->robot_archive->getObjects($offset, 1);
         $robot_cache[$key] = $robot_object;
         // keep the memo bounded by dropping the element at its front
         if (count($robot_cache) > $cache_size) {
             array_shift($robot_cache);
         }
     } else {
         $robot_object = $robot_cache[$key];
     }
     $robot_paths = array();
     if (isset($robot_object[0][1])) {
         // per the original note, these paths should already have been
         // urldecoded in RobotProcessor
         $robot_paths = $robot_object[0][1];
     }
     $disallowed_hit = false;
     if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
         $disallowed_hit = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::DISALLOWED_SITES]);
     }
     // without an allow list, being allowed just means not being disallowed
     $allowed_hit = !$disallowed_hit;
     if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
         $allowed_hit = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::ALLOWED_SITES]);
     }
     return $allowed_hit || !$disallowed_hit;
 }