Esempi in PHP per UrlParser::getDocumentType

Linguaggio di programmazione: PHP

Classe/tipologia: UrlParser

Metodo/funzione: getDocumentType

Esempi su hotexamples.com: 7

UrlParser::getDocumentType in PHP: 7 esempi trovati. Questi sono i migliori esempi reali in PHP per UrlParser::getDocumentType, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

canonicalLink(12)

getHost(11)

getDocumentType(7)

checkRecursiveUrl(6)

getDocumentFilename(5)

getPath(4)

urlMemberSiteArray(4)

getScheme(4)

getWordsLastPathPartUrl(3)

getWordsIfHostUrl(3)

getPathArray(3)

isLocalhostUrl(3)

isPathMemberRegexPaths(3)

simplifyUrl(3)

getHostAndPath(2)

pruneLinks(2)

guessMimeTypeFromFileName(2)

getHostSubdomains(2)

parse(1)

parseUrl(1)

isVideoUrl(1)

getPort(1)

isFollowUrl(1)

getLang(1)

getHostPaths(1)

getCourseDirName(1)

defaultFilter(1)

cleanRedundantLinks(1)

urlParse(1)

Esempio n. 1

Mostra file

File: queue_server.php Progetto: yakar/yioop

 /**
  * Decides whether a url may be crawled: its document type has to be one
  * of the indexed file types and, when crawling is restricted by url, the
  * url has to be a member of the list of allowed sites.
  *
  * @param string $url url to check
  * @return bool whether the url is allowed to be crawled or not
  */
 function allowedToCrawlSite($url)
 {
     $file_type = UrlParser::getDocumentType($url);
     // any type not listed in all_file_types is handled as "unknown"
     $is_known_type = in_array($file_type, $this->all_file_types);
     $file_type = $is_known_type ? $file_type : "unknown";
     if (!in_array($file_type, $this->indexed_file_types)) {
         return false;
     }
     if (!$this->restrict_sites_by_url) {
         return true;
     }
     // name passed to urlMemberSiteArray along with the allowed sites list
     $allowed_name = "a" . $this->allow_disallow_cache_time;
     return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
         $allowed_name);
 }

Esempio n. 2

Mostra file

File: url_parser.php Progetto: yakar/yioop

 /**
  * Guesses a mime type from the extension of a file name. The extension
  * is whatever UrlParser::getDocumentType() reports for the name; names
  * whose extension is not in the table below are reported as text/plain.
  *
  * @param string $file_name name of the file
  * @return string $mime_type for the given file name
  */
 static function guessMimeTypeFromFileName($file_name)
 {
     // NOTE(review): 'text/asp', 'text/java', 'text/py' and 'text/gitxml'
     // are not registered media types; presumably they are internal labels
     // used elsewhere in the project, so they are left untouched -- confirm.
     $mime_type_map = array(
         "bmp" => 'image/bmp',
         "doc" => 'application/msword',
         "epub" => 'application/epub+zip',
         "gif" => 'image/gif',
         "asp" => 'text/asp',
         "aspx" => 'text/asp',
         'cgi' => 'text/html',
         "cfm" => 'text/html',
         "cfml" => 'text/html',
         "do" => 'text/html',
         "htm" => 'text/html',
         "html" => 'text/html',
         "jsp" => 'text/html',
         "php" => 'text/html',
         "pl" => 'text/html',
         "java" => 'text/java',
         "py" => 'text/py',
         "shtml" => 'text/html',
         "jpg" => 'image/jpeg',
         "jpeg" => 'image/jpeg',
         "pdf" => 'application/pdf',
         "png" => 'image/png',
         "ppt" => 'application/vnd.ms-powerpoint',
         "pptx" => 'application/vnd.openxmlformats-officedocument.' .
             'presentationml.presentation',
         "rss" => 'application/rss+xml',
         "rtf" => 'text/rtf',
         "svg" => 'image/svg+xml',
         "csv" => 'text/csv',
         "tab" => 'text/tab-separated-values',
         // was 'text/x-java-source', which is wrong for tab-separated data
         "tsv" => 'text/tab-separated-values',
         "txt" => 'text/plain',
         "xlsx" => 'application/vnd.openxmlformats-officedocument.' .
             'spreadsheetml.sheet',
         "xml" => 'text/gitxml',
         "js" => 'text/plain',
         "c" => 'text/plain',
         "cc" => 'text/plain',
         "cs" => 'text/plain'
     );
     $extension = UrlParser::getDocumentType($file_name);
     if (isset($mime_type_map[$extension])) {
         return $mime_type_map[$extension];
     }
     // unknown extensions fall back to plain text
     return "text/plain";
 }

Esempio n. 3

Mostra file

File: sitemap_processor.php Progetto: yakar/yioop

 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. Both <urlset> and <sitemapindex>
  * documents are read. At most MAX_LINKS_PER_SITEMAP links are accepted.
  * For now weighting info is ignored.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object, as url => description pairs
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s',
         "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc",
         "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if ($nodes === false) {
             // evaluation failed; nothing to iterate for this path
             continue;
         }
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" ||
                 UrlParser::checkRecursiveUrl($url) ||
                 UrlParser::getDocumentType($url) == "gz" ||
                 strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             // >= (not >) so that no more than the maximum is accepted
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }

Esempio n. 4

Mostra file

File: java_processor.php Progetto: yakar/yioop

 /**
  * Determines the language tag of a document. This implementation uses
  * the document type of $url, as reported by
  * UrlParser::getDocumentType(), as the language; $sample_text is not
  * consulted.
  *
  * @param string $sample_text sample text to try guess the language from
  *     (unused here)
  * @param string $url url of the document whose document type is used
  *     as the language
  *
  * @return string language tag for guessed language, or NULL when no
  *     $url is given
  */
 static function calculateLang($sample_text = NULL, $url = NULL)
 {
     // initialise so a missing $url yields NULL instead of reading an
     // undefined variable
     $lang = NULL;
     if ($url != NULL) {
         $lang = UrlParser::getDocumentType($url);
     }
     return $lang;
 }

Esempio n. 5

Mostra file

File: phrase_parser.php Progetto: yakar/yioop

 /**
  * Used to compute all the meta ids for a given link with $url
  * and $link_text that was on a site with $site_url.
  *
  * @param string $url url of the link
  * @param string $link_host url of the host name of the link
  * @param string $link_text text of the anchor tag link came from
  * @param string $site_url url of the page link was on
  * @return array meta ids for the link; empty when $link_host is empty
  */
 static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
 {
     global $IMAGE_TYPES;
     $link_meta_ids = array();
     if (strlen($link_host) == 0) {
         // a link without a host gets no meta ids ("continue" is not legal
         // here because this method body is not inside a loop)
         return $link_meta_ids;
     }
     if (substr($link_text, 0, 9) == "location:") {
         $link_meta_ids[] = $link_text;
         $link_meta_ids[] = "location:all";
         $link_meta_ids[] = "location:" . crawlHash($site_url);
     }
     $link_type = UrlParser::getDocumentType($url);
     $link_meta_ids[] = "media:all";
     // no safety verdict exists in this scope, so only safe:all is emitted
     $link_meta_ids[] = "safe:all";
     if (in_array($link_type, $IMAGE_TYPES)) {
         $link_meta_ids[] = "media:image";
     } else {
         $link_meta_ids[] = "media:text";
     }
     $link_meta_ids[] = "link:all";
     return $link_meta_ids;
 }

Esempio n. 6

Mostra file

File: fetcher.php Progetto: yakar/yioop

 /**
  * Culls the links stored in $doc_info[$field]. Links whose file type is
  * not indexed, or whose site the crawler is forbidden from crawling, are
  * removed first. The remaining links are handed to
  * UrlParser::pruneLinks(), which picks the ones to keep (according to
  * the original documentation, the best MAX_LINKS_PER_PAGE by a crude
  * estimate of the information in the link text: strlen(gzip(text))).
  * Nothing is done if $doc_info has no $field entry.
  *
  * @param array& $doc_info an array with a $field subarray.
  *      This subarray in turn contains url => text pairs.
  * @param string $field field for links default is CrawlConstants::LINKS
  * @param int $member_cache_time says how long allowed and disallowed url
  *      info should be cached by urlMemberSiteArray
  */
 function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS, $member_cache_time = 0)
 {
     // check the field that is actually read and written below, not
     // unconditionally the LINKS field
     if (!isset($doc_info[$field])) {
         return;
     }
     $links = array();
     $allowed_name = "a" . $member_cache_time;
     $disallowed_name = "d" . $member_cache_time;
     foreach ($doc_info[$field] as $url => $text) {
         $doc_type = UrlParser::getDocumentType($url);
         // any type not listed in all_file_types is handled as "unknown"
         if (!in_array($doc_type, $this->all_file_types)) {
             $doc_type = "unknown";
         }
         if (!in_array($doc_type, $this->indexed_file_types)) {
             continue;
         }
         if ($this->restrict_sites_by_url &&
             !UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
                 $allowed_name)) {
             continue;
         }
         if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
             $disallowed_name)) {
             continue;
         }
         $links[$url] = $text;
     }
     $doc_info[$field] = UrlParser::pruneLinks($links);
 }

Esempio n. 7

Mostra file

File: resource_controller.php Progetto: yakar/yioop

 /**
  * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
  * CRAWL_DIR/$_REQUEST['f']  after cleaning.
  *
  * Request fields read:
  *  n - name of the resource (required)
  *  f - folder: one of "css", "scripts", "resources" or "cache" (required)
  *  t - if present, ".jpg" is appended to the name (non-cache folders only)
  *  o, l - if both present, only l bytes starting at byte offset o are sent
  *
  * A Range request header is delegated to serveRangeRequest(). If the
  * file does not exist the client is redirected to ./error.php.
  */
 function get()
 {
     if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
         return;
     }
     $name = $this->clean($_REQUEST['n'], "string");
     if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
         /* notice in this case we don't check if request come from a
               legitimate source but we do try to restrict it to being
               a file (not a folder) in the above array
            */
         $base_dir = $this->getBaseFolder();
         if (!$base_dir) {
             header('HTTP/1.1 401 Unauthorized');
             echo "<html><head><title>401 Unauthorized</title></head>" . "<body><p>401 Unauthorized</p></body></html>";
             return;
         }
         // rebuild the name from its filename and type parts only
         $type = UrlParser::getDocumentType($name);
         $name = UrlParser::getDocumentFilename($name);
         $name = $type != "" ? "{$name}.{$type}" : $name;
         if (isset($_REQUEST['t'])) {
             $name .= ".jpg";
         }
     } else if (in_array($_REQUEST['f'], array("cache"))) {
         /*  perform check since these request should come from a known
                 machine
             */
         if (!$this->checkRequest()) {
             return;
         }
         // NOTE(review): $name is not reduced to a bare file name on this
         // path; this relies on checkRequest() -- confirm that is enough.
         $folder = $_REQUEST['f'];
         $base_dir = CRAWL_DIR . "/{$folder}";
     } else {
         return;
     }
     if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
         $offset = $this->clean($_REQUEST['o'], "int");
         $limit = $this->clean($_REQUEST['l'], "int");
     }
     $path = "{$base_dir}/{$name}";
     if (!file_exists($path)) {
         header("Location:./error.php");
         return;
     }
     $mime_type = mimeType($path);
     $size = filesize($path);
     $start = 0;
     $end = $size - 1;
     header("Content-type: {$mime_type}");
     header("Accept-Ranges: bytes");
     if (isset($_SERVER['HTTP_RANGE'])) {
         $this->serveRangeRequest($path, $size, $start, $end);
         return;
     }
     if (isset($offset) && isset($limit)) {
         // read the slice first so the headers can describe what is
         // really sent rather than the whole file
         $data = file_get_contents($path, false, NULL, $offset, $limit);
         if ($data === false) {
             $data = "";
         }
         $length = strlen($data);
         header("Content-Length: " . $length);
         if ($length > 0) {
             $slice_end = $offset + $length - 1;
             header("Content-Range: bytes {$offset}-{$slice_end}/{$size}");
         }
         echo $data;
         return;
     }
     header("Content-Length: " . $size);
     header("Content-Range: bytes {$start}-{$end}/{$size}");
     readfile($path);
 }