Example no. 1
 /**
  * {@inheritDoc}
  *
  * @param string $page  the image represented as a character string
  * @param string $url  the url where the image was downloaded from
  * @return array summary information including a thumbnail and a
  *     description (where the description is derived from the url's
  *     document filename)
  */
 function process($page, $url)
 {
     // initialize so null (rather than an undefined variable) is returned
     // when $page is not a string
     $summary = null;
     if (is_string($page)) {
         $image = $this->imagecreatefrombmp($page);
         $thumb_string = self::createThumb($image);
         $summary[self::TITLE] = "";
         $summary[self::DESCRIPTION] = "Image of " . UrlParser::getDocumentFilename($url);
         $summary[self::LINKS] = array();
         $summary[self::PAGE] = "<html><body><div><img " .
             "src='data:image/bmp;base64," . base64_encode($page) .
             "' alt='" . $summary[self::DESCRIPTION] . "' /></div></body></html>";
         $summary[self::THUMB] = 'data:image/jpeg;base64,' .
             base64_encode($thumb_string);
     }
     return $summary;
 }
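A minimal usage sketch for the processor above (hypothetical: the BmpProcessor
class name, file path, and url are illustrative assumptions, not taken from the
example):

 // hypothetical caller; class name, path, and url are assumptions
 $page = file_get_contents("/tmp/sample.bmp");  // raw BMP bytes as a string
 $url = "http://www.example.com/images/sample.bmp";
 $processor = new BmpProcessor();
 $summary = $processor->process($page, $url);
 echo $summary[BmpProcessor::DESCRIPTION];  // e.g., "Image of sample"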
Example no. 2
 /**
  * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
  * downloaded in one go using the to_crawl array. Delete these sites
  * from the to_crawl array.
  *
  * @return array sites which are ready to be downloaded
  */
 function getFetchSites()
 {
     $web_archive = $this->web_archive;
     $start_time = microtime();
     $seeds = array();
     $delete_indices = array();
     $num_items = count($this->to_crawl);
     if ($num_items > 0) {
         $crawl_source =& $this->to_crawl;
         $to_crawl_flag = true;
     } else {
         crawlLog("...Trying to crawl sites which failed the first time");
         $num_items = count($this->to_crawl_again);
         $crawl_source =& $this->to_crawl_again;
         $to_crawl_flag = false;
     }
     reset($crawl_source);
     if ($num_items > NUM_MULTI_CURL_PAGES) {
         $num_items = NUM_MULTI_CURL_PAGES;
     }
     //DNS lookups take longer so try to get fewer in one go
     $num_ip_lookups = max($num_items / 3, 2);
     $i = 0;
     $ip_lookup_cnt = 0;
     $site_pair = each($crawl_source);
     while ($site_pair !== false && $i < $num_items && $ip_lookup_cnt < $num_ip_lookups) {
         $delete_indices[] = $site_pair['key'];
         if ($site_pair['value'][0] != self::DUMMY) {
             $host = UrlParser::getHost($site_pair['value'][0]);
             if (!strpos($site_pair['value'][0], "###")) {
                 $ip_lookup_cnt++;
             }
             // only download if host doesn't seem congested
             if (!isset($this->hosts_with_errors[$host]) ||
                 $this->hosts_with_errors[$host] < DOWNLOAD_ERROR_THRESHOLD) {
                 $url_to_check = $site_pair['value'][0];
                 $extension = UrlParser::getDocumentType($url_to_check);
                 $repository_indicator = FetchGitRepositoryUrls::checkForRepository($extension);
                 if ($repository_indicator == self::REPOSITORY_GIT) {
                     $git_internal_urls = FetchGitRepositoryUrls::setGitRepositoryUrl(
                         $url_to_check, $i, $seeds, $repository_indicator,
                         $site_pair, $this->total_git_urls, $this->all_git_urls);
                     $i = $git_internal_urls['position'];
                     $git_url_index = $git_internal_urls['index'];
                     $seeds = $git_internal_urls['seeds'];
                     $repository_indicator = $git_internal_urls['indicator'];
                     $this->total_git_urls = $git_internal_urls['count'];
                     $this->all_git_urls = $git_internal_urls['all'];
                 } else {
                     $seeds[$i][self::URL] = $site_pair['value'][0];
                     $seeds[$i][self::WEIGHT] = $site_pair['value'][1];
                     $seeds[$i][self::CRAWL_DELAY] = $site_pair['value'][2];
                 }
                 /*
                   Crawl delay is only used in scheduling on the queue_server.
                   On the fetcher, we only use crawl-delay to determine
                   whether to give a page a second try if it doesn't
                   download the first time.
                 */
                 if (UrlParser::getDocumentFilename($seeds[$i][self::URL]) .
                     "." . UrlParser::getDocumentType($seeds[$i][self::URL]) ==
                     "robots.txt") {
                     $seeds[$i][self::ROBOT_PATHS] = array();
                 }
                 $i++;
             }
         } else {
             break;
         }
         $site_pair = each($crawl_source);
     }
     //end while
     foreach ($delete_indices as $delete_index) {
         $git_set = false;
         if ($to_crawl_flag == true) {
             $extension = UrlParser::getDocumentType($this->to_crawl[$delete_index][0]);
             $repository_type = FetchGitRepositoryUrls::checkForRepository($extension);
             if ($repository_type != self::REPOSITORY_GIT) {
                 unset($this->to_crawl[$delete_index]);
             }
         } else {
             $extension = UrlParser::getDocumentType($this->to_crawl_again[$delete_index][0]);
             $repository_type = FetchGitRepositoryUrls::checkForRepository($extension);
             unset($this->to_crawl_again[$delete_index]);
         }
         if ($repository_type == self::REPOSITORY_GIT) {
             if (!$git_set) {
                 $next_url_start = $url_to_check . self::GIT_URL_CONTINUE . $git_url_index;
                 $git_set = true;
                 $this->to_crawl[$delete_index][0] = $next_url_start;
             }
             if ($repository_indicator == self::INDICATOR_NONE) {
                 unset($this->to_crawl[$delete_index]);
             }
         }
     }
     crawlLog("Fetch url list to download time " . changeInMicrotime($start_time));
     return $seeds;
 }
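A hedged sketch of how this method might be driven (assumptions: a Fetcher-like
object exposing to_crawl, whose entries are (url, weight, crawl-delay) triples
as the loop above reads them):

 // hypothetical driver; $fetcher and the to_crawl layout are assumptions
 $fetcher->to_crawl = array(
     array("http://www.example.com/", 1.0, 0),
     array("http://www.example.com/robots.txt", 1.0, 0),
 );
 $batch = $fetcher->getFetchSites();
 crawlLog("Prepared " . count($batch) . " sites for this multi-curl round");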
Example no. 3
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     a url to get; defaults to URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true, do a faster request of pages by not
  *     doing things like extracting the HTTP headers sent, etc.
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     servers to use rather than downloading web pages directly from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false,
     $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
     $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
     $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "",
     $proxy_servers = array())
 {
     $agent_handler = curl_multi_init();
     $active = NULL;
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (isset($sites[$i][$key])) {
             list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders(
                 $sites[$i][$key], $minimal, $proxy_servers);
             if ($headers == "gopher") {
                 $is_gopher = true;
                 $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                 $headers = array();
             }
             $sites[$i][0] = curl_init();
             if (!$minimal) {
                 $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
             curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
             curl_setopt($sites[$i][0], CURLOPT_URL, $url);
             if (strcmp(substr($url, -10), "robots.txt") == 0) {
                 $sites[$i]['ROBOT'] = true;
                 $follow = true;
                 /* Wikipedia redirects its robots page, so force following
                    redirects for robots.txt requests
                  */
             }
             curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
             curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
             curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
             curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
             curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
             curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
             if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                 curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                 //CURLPROXY_SOCKS5_HOSTNAME = 7
                 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                 if ($timer) {
                     crawlLog("Using Tor proxy for {$url}..");
                 }
             } else {
                 if ($proxy_servers != array() && !$is_gopher) {
                     $select_proxy = rand(0, count($proxy_servers) - 1);
                     $proxy_server = $proxy_servers[$select_proxy];
                     $proxy_parts = explode(":", $proxy_server);
                     $proxy_ip = $proxy_parts[0];
                     if (!isset($proxy_parts[2]) ||
                         strtolower($proxy_parts[2]) == 'http') {
                         $proxy_type = CURLPROXY_HTTP;
                     } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                         $proxy_type = CURLPROXY_SOCKS5;
                     } else {
                         $proxy_type = $proxy_parts[2];
                     }
                     if (isset($proxy_parts[1])) {
                         $proxy_port = $proxy_parts[1];
                     } else {
                         $proxy_port = "80";
                     }
                     curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}");
                     curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                     if ($timer) {
                         crawlLog("Selecting proxy {$select_proxy} for {$url}");
                     }
                 }
             }
             if (!$minimal) {
                 curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
             }
             //make lighttpd happier
             if (!$is_gopher) {
                 curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
             }
             curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
             // ^ need to set for sites like att that use gzip
             if ($page_range_request > 0) {
                 curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
             }
             if ($post_data != NULL) {
                 curl_setopt($sites[$i][0], CURLOPT_POST, true);
                 curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
             }
             curl_multi_add_handle($agent_handler, $sites[$i][0]);
         }
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     do {
         $mrc = curl_multi_exec($agent_handler, $running);
         $ready = curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     if (time() - $start > PAGE_TIMEOUT && $timer) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         $is_gopher = false;
         if (isset($sites[$i][0]) && $sites[$i][0]) {
             // Get Data and Message Code
             $content = @curl_multi_getcontent($sites[$i][0]);
             $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
             /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data
                here
             */
             if ($page_range_request > 0) {
                 $content = substr($content, 0, $page_range_request);
             }
             if (isset($content) && !$minimal && !$is_gopher) {
                 $site = self::parseHeaderPage($content, $value);
                 $sites[$i] = array_merge($sites[$i], $site);
                 if (isset($header)) {
                     $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
                 } else {
                     $header = "";
                 }
                 $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
                 unset($header);
             } elseif (isset($content) && !$minimal && $is_gopher) {
                 $sites[$i][CrawlConstants::HEADER] = $header;
                 $sites[$i][$value] = $content;
                 unset($header);
             } else {
                 $sites[$i][$value] = $content;
             }
             if (!$minimal) {
                 $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
                 $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
                 $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
                 $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                 if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                     $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                 } elseif ($is_gopher) {
                     // gopher has no HTTP status line, so mark a completed
                     // fetch as 200 rather than clobbering real HTTP codes
                     $sites[$i][self::HTTP_CODE] = 200;
                 }
                 if ($ip_addresses) {
                     $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                 } else {
                     $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                 }
                 //Get Time, Mime type and Character encoding
                 $sites[$i][self::TIMESTAMP] = time();
                 if ($is_gopher) {
                     $path = UrlParser::getPath($sites[$i][self::URL]);
                     $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]);
                     if (isset($path[1])) {
                         $gopher_type = $path[1];
                     } else {
                         $gopher_type = 1;
                     }
                     if ($gopher_type == 1) {
                         $sites[$i][self::TYPE] = "text/gopher";
                     } elseif (in_array($gopher_type, array(0, 3, 6))) {
                         $sites[$i][self::TYPE] = "text/plain";
                         if ($gopher_type == 6) {
                             $sites[$i][$value] = convert_uudecode($content);
                         }
                     } elseif ($gopher_type == 'h') {
                         $sites[$i][self::TYPE] = "text/html";
                     } elseif ($gopher_type == 'g') {
                         $sites[$i][self::TYPE] = "image/gif";
                     }
                     $path_info = pathinfo($filename);
                     if (!isset($sites[$i][self::TYPE]) &&
                         isset($path_info['extension'])) {
                         $sites[$i][self::TYPE] =
                             UrlParser::guessMimeTypeFromFileName($filename);
                     } elseif (!isset($sites[$i][self::TYPE])) {
                         $sites[$i][self::TYPE] = "unknown";
                     }
                 } else {
                     $type_parts = explode(";",
                         curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                     $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
                 }
             }
             //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
             curl_close($sites[$i][0]);
             if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                 if (isset($sites[$i][self::TYPE]) &&
                     $sites[$i][self::TYPE] != "text/plain" &&
                     isset($sites[$i][CrawlConstants::LOCATION]) &&
                     count($site[CrawlConstants::LOCATION]) > 0) {
                     $sites[$i][self::TYPE] = "text/plain";
                     $sites[$i][self::HTTP_CODE] = "200";
                     $tmp = wordwrap($sites[$i][$value], 80);
                     $tmp_parts = explode("\n", $tmp);
                     $tmp = "# Suspect server misconfiguration\n";
                     $tmp .= "# Assume shouldn't crawl this site.\n";
                     $tmp .= "# Pretending got following robots.txt.\n";
                     $tmp .= "User-agent: *\n";
                     $tmp .= "Disallow: /\n";
                     $tmp .= "# Original error code: " . $sites[$i][self::HTTP_CODE] . "\n";
                     $tmp .= "# Original content:\n";
                     foreach ($tmp_parts as $part) {
                         $tmp = "#" . $part . "\n";
                     }
                     $sites[$i][$value] = $tmp;
                     $sites[$i][self::HTTP_CODE] = "200";
                     unset($site[CrawlConstants::LOCATION]);
                 }
             }
         }
         //end big if
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }
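One possible calling pattern, assuming getPages() is declared on a
FetchUrl-style class and that the URL/HTTP_CODE/SIZE constants used above come
from CrawlConstants (the URLs are placeholders):

 // hypothetical caller; the class name FetchUrl and the URLs are assumptions
 $sites = array(
     array(CrawlConstants::URL => "http://www.example.com/"),
     array(CrawlConstants::URL => "http://www.example.com/robots.txt"),
 );
 $sites = FetchUrl::getPages($sites, true);
 foreach ($sites as $site) {
     crawlLog($site[CrawlConstants::URL] . " => HTTP " .
         $site[CrawlConstants::HTTP_CODE] . ", " .
         $site[CrawlConstants::SIZE] . " bytes");
 }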
Example no. 4
 /**
  * Used to recompute the dictionary of an index archive -- either from
  * scratch using the index shard data or just using the current dictionary
  * but merging the tiers into one tier
  *
  * @param string $path file path to dictionary of an IndexArchiveBundle
  * @param int $max_tier tier up to which the dictionary tiers should be
  *     merged (typically a value greater than the max_tier of the
  *     dictionary)
  */
 function reindexIndexArchive($path, $max_tier = -1)
 {
     if ($this->getArchiveKind($path) != "IndexArchiveBundle") {
         echo "\n{$path} ...\n" . "  is not an IndexArchiveBundle so cannot be re-indexed\n\n";
         exit;
     }
     $shards = glob($path . "/posting_doc_shards/index*");
     if (is_array($shards)) {
         // create the database manager before the if so $db is also defined
         // when only merging tiers ($max_tier != -1) and used further below
         $dbms_manager = DBMS . "Manager";
         $db = new $dbms_manager();
         if ($max_tier == -1) {
             $db->unlinkRecursive($path . "/dictionary", false);
             IndexDictionary::makePrefixLetters($path . "/dictionary");
         }
         $dictionary = new IndexDictionary($path . "/dictionary");
         if ($max_tier == -1) {
             $max_generation = 0;
             foreach ($shards as $shard_name) {
                 $file_name = UrlParser::getDocumentFilename($shard_name);
                 $generation = (int) substr($file_name, strlen("index"));
                 $max_generation = max($max_generation, $generation);
             }
             for ($i = 0; $i < $max_generation + 1; $i++) {
                 $shard_name = $path . "/posting_doc_shards/index{$i}";
                 echo "\nShard {$i}\n";
                 $shard = new IndexShard($shard_name, $i, NUM_DOCS_PER_GENERATION, true);
                 $dictionary->addShardDictionary($shard);
             }
             $max_tier = $dictionary->max_tier;
         }
         echo "\nFinal Merge Tiers\n";
         $dictionary->mergeAllTiers(NULL, $max_tier);
         $db->setWorldPermissionsRecursive($path . "/dictionary");
         echo "\nReindex complete!!\n";
     } else {
         echo "\n{$path} ...\n" . "  does not contain posting shards so cannot be re-indexed\n\n";
     }
 }
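A brief call-site sketch (assumptions: the method lives on an ArcTool-like
command line object and the bundle path is a placeholder):

 // hypothetical invocation; class name and bundle path are assumptions
 $arc_tool = new ArcTool();
 // rebuild the dictionary from the posting_doc_shards from scratch
 $arc_tool->reindexIndexArchive(CRAWL_DIR . "/cache/IndexData1234567890");
 // or only merge the existing dictionary tiers up to a given tier
 $arc_tool->reindexIndexArchive(CRAWL_DIR . "/cache/IndexData1234567890", 3);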
Example no. 5
 /**
  * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
  * CRAWL_DIR/$_REQUEST['f']  after cleaning
  */
 function get()
 {
     if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
         return;
     }
     $name = $this->clean($_REQUEST['n'], "string");
     if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
         /* notice in this case we don't check whether the request comes from
               a legitimate source, but we do try to restrict it to being
               a file (not a folder) in the above array
            */
         $base_dir = $this->getBaseFolder();
         if (!$base_dir) {
             header('HTTP/1.1 401 Unauthorized');
             echo "<html><head><title>401 Unauthorized</title></head>" . "<body><p>401 Unauthorized</p></body></html>";
             return;
         }
         $type = UrlParser::getDocumentType($name);
         $name = UrlParser::getDocumentFilename($name);
         $name = $type != "" ? "{$name}.{$type}" : $name;
         if (isset($_REQUEST['t'])) {
             $name .= ".jpg";
         }
     } elseif (in_array($_REQUEST['f'], array("cache"))) {
         /* perform a check since these requests should come from a known
            machine
          */
         if (!$this->checkRequest()) {
             return;
         }
         $folder = $_REQUEST['f'];
         $base_dir = CRAWL_DIR . "/{$folder}";
     } else {
         return;
     }
     if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
         $offset = $this->clean($_REQUEST['o'], "int");
         $limit = $this->clean($_REQUEST['l'], "int");
     }
     $path = "{$base_dir}/{$name}";
     if (file_exists($path)) {
         $mime_type = mimeType($path);
         $size = filesize($path);
         $start = 0;
         $end = $size - 1;
         header("Content-type: {$mime_type}");
         header("Accept-Ranges: bytes");
         if (isset($_SERVER['HTTP_RANGE'])) {
             $this->serveRangeRequest($path, $size, $start, $end);
             return;
         }
         header("Content-Length: " . $size);
         header("Content-Range: bytes {$start}-{$end}/{$size}");
         if (isset($offset) && isset($limit)) {
             echo file_get_contents($path, false, NULL, $offset, $limit);
         } else {
             readfile($path);
         }
     } else {
         header("Location:./error.php");
     }
 }
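A sketch of the kind of request this method serves (assumptions: the
ResourceController class name is illustrative, and in production the values
would arrive via an HTTP request rather than being set directly):

 // hypothetical test harness; class and file names are assumptions
 $_REQUEST['f'] = "resources";  // one of the allowed folders
 $_REQUEST['n'] = "logo.png";   // resource name, cleaned before use
 $controller = new ResourceController();
 $controller->get();            // emits headers and streams the file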