Ejemplos de UrlParser::getDocumentType en PHP

Lenguaje de programación: PHP

Clase / Tipo: UrlParser

Método / Función: getDocumentType

Ejemplos en hotexamples.com: 7

PHP UrlParser::getDocumentType - 7 ejemplos encontrados. Estos son los ejemplos en PHP del mundo real mejor valorados de UrlParser::getDocumentType extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

canonicalLink(12)

getHost(11)

getDocumentType(7)

checkRecursiveUrl(6)

getDocumentFilename(5)

getPath(4)

urlMemberSiteArray(4)

getScheme(4)

getWordsLastPathPartUrl(3)

getWordsIfHostUrl(3)

getPathArray(3)

isLocalhostUrl(3)

isPathMemberRegexPaths(3)

simplifyUrl(3)

getHostAndPath(2)

pruneLinks(2)

guessMimeTypeFromFileName(2)

getHostSubdomains(2)

parse(1)

parseUrl(1)

isVideoUrl(1)

getPort(1)

isFollowUrl(1)

getLang(1)

getHostPaths(1)

getCourseDirName(1)

defaultFilter(1)

cleanRedundantLinks(1)

urlParse(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: queue_server.php Proyecto: yakar/yioop

 /**
  * Decides whether a url may be crawled, based on its document type and,
  * when site restriction is enabled, on the list of allowed sites.
  *
  * The document type reported by UrlParser::getDocumentType() is treated
  * as "unknown" if it is not one of $this->all_file_types; the url is
  * rejected when the resulting type is not in $this->indexed_file_types.
  *
  * @param string $url url to check
  * @return bool false if the file type is not indexed; otherwise the
  *     result of UrlParser::urlMemberSiteArray() when
  *     $this->restrict_sites_by_url is set, or true when it is not
  */
 function allowedToCrawlSite($url)
 {
     $type = UrlParser::getDocumentType($url);
     $type = in_array($type, $this->all_file_types) ? $type : "unknown";
     if (!in_array($type, $this->indexed_file_types)) {
         return false;
     }
     if (!$this->restrict_sites_by_url) {
         return true;
     }
     // cache name passed to urlMemberSiteArray: "a" plus the cache time
     $cache_name = "a" . $this->allow_disallow_cache_time;
     return UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $cache_name);
 }

Ejemplo n.º 2

Mostrar archivo

Archivo: url_parser.php Proyecto: yakar/yioop

 /**
  * Guesses a mime type from the extension of a file name.
  *
  * The extension is obtained with UrlParser::getDocumentType() and looked
  * up in a fixed table; anything not in the table is reported as
  * "text/plain".
  *
  * NOTE(review): several table values ('text/asp', 'text/java', 'text/py',
  * 'text/gitxml') are not registered media types; they are presumably
  * internal identifiers used elsewhere in the project and are kept as-is.
  *
  * @param string $file_name name of the file
  * @return string $mime_type for the given file name
  */
 static function guessMimeTypeFromFileName($file_name)
 {
     $mime_type_map = array(
         "bmp" => 'image/bmp',
         "doc" => 'application/msword',
         "epub" => 'application/epub+zip',
         "gif" => 'image/gif',
         "asp" => 'text/asp',
         "aspx" => 'text/asp',
         'cgi' => 'text/html',
         "cfm" => 'text/html',
         "cfml" => 'text/html',
         "do" => 'text/html',
         "htm" => 'text/html',
         "html" => 'text/html',
         "jsp" => 'text/html',
         "php" => 'text/html',
         "pl" => 'text/html',
         "java" => 'text/java',
         "py" => 'text/py',
         "shtml" => 'text/html',
         "jpg" => 'image/jpeg',
         "jpeg" => 'image/jpeg',
         "pdf" => 'application/pdf',
         "png" => 'image/png',
         "ppt" => 'application/vnd.ms-powerpoint',
         "pptx" => 'application/vnd.openxmlformats-officedocument.'
             . 'presentationml.presentation',
         "rss" => 'application/rss+xml',
         "rtf" => 'text/rtf',
         "svg" => 'image/svg+xml',
         "csv" => 'text/csv',
         "tab" => 'text/tab-separated-values',
         // was 'text/x-java-source': tab-separated files are not Java source
         "tsv" => 'text/tab-separated-values',
         "txt" => 'text/plain',
         "xlsx" => 'application/vnd.openxmlformats-officedocument.'
             . 'spreadsheetml.sheet',
         "xml" => 'text/gitxml',
         "js" => 'text/plain',
         "c" => 'text/plain',
         "cc" => 'text/plain',
         "cs" => 'text/plain'
     );
     $extension = UrlParser::getDocumentType($file_name);
     if (isset($mime_type_map[$extension])) {
         return $mime_type_map[$extension];
     }
     // unrecognized extensions are served as plain text
     return "text/plain";
 }

Ejemplo n.º 3

Mostrar archivo

Archivo: sitemap_processor.php Proyecto: yakar/yioop

 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. We allow more links from a sitemap
  * than from other kinds of documents. For now we are ignoring weighting
  * info.
  *
  * Both <urlset> and <sitemapindex> documents are read. A link is skipped
  * when it cannot be canonicalized, is flagged by
  * UrlParser::checkRecursiveUrl(), has document type "gz", or is
  * MAX_URL_LEN characters or longer. At most MAX_LINKS_PER_SITEMAP links
  * are accepted (repeated urls count toward this limit).
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   url => description pairs for links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if (!$nodes) {
             // evaluation failed (warnings are suppressed above); nothing
             // to iterate for this path
             continue;
         }
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             // >= rather than >: the check runs after the increment, so
             // using > would let MAX_LINKS_PER_SITEMAP + 1 links through
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }

Ejemplo n.º 4

Mostrar archivo

Archivo: java_processor.php Proyecto: yakar/yioop

 /**
  * Determines the "language" of a document from its url.
  *
  * In this implementation the value returned is the document type that
  * UrlParser::getDocumentType() reports for $url; $sample_text is
  * accepted for signature compatibility but is not examined.
  *
  * @param string $sample_text sample text of the document (unused here)
  * @param string $url url of the document
  *
  * @return string|null document type of $url, or NULL when no url is given
  */
 static function calculateLang($sample_text = NULL, $url = NULL)
 {
     // initialize so that a missing url yields NULL rather than reading an
     // undefined variable
     $lang = NULL;
     if ($url != NULL) {
         $lang = UrlParser::getDocumentType($url);
     }
     return $lang;
 }

Ejemplo n.º 5

Mostrar archivo

Archivo: phrase_parser.php Proyecto: yakar/yioop

 /**
  * Used to compute all the meta ids for a given link with $url
  * and $link_text that was on a site with $site_url.
  *
  * Every link with a non-empty host gets "media:all", "safe:all" and
  * "link:all", plus "media:image" or "media:text" depending on whether
  * its document type is listed in the global $IMAGE_TYPES. Link text
  * beginning with "location:" additionally yields the link text itself,
  * "location:all" and "location:" followed by the crawlHash of $site_url.
  *
  * @param string $url url of the link
  * @param string $link_host url of the host name of the link
  * @param string $link_text text of the anchor tag link came from
  * @param string $site_url url of the page link was on
  * @return array meta id strings for the link; empty if $link_host is empty
  */
 static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
 {
     global $IMAGE_TYPES;
     $link_meta_ids = array();
     if (strlen($link_host) == 0) {
         // was "continue;", which is invalid outside of a loop (a compile
         // error in PHP >= 7.3); a link without a host has no meta ids
         return $link_meta_ids;
     }
     if (substr($link_text, 0, 9) == "location:") {
         $link_meta_ids[] = $link_text;
         $link_meta_ids[] = "location:all";
         $link_meta_ids[] = "location:" . crawlHash($site_url);
     }
     $link_type = UrlParser::getDocumentType($url);
     $link_meta_ids[] = "media:all";
     $link_meta_ids[] = "safe:all";
     if (in_array($link_type, $IMAGE_TYPES)) {
         $link_meta_ids[] = "media:image";
         // NOTE(review): $safe is never assigned in this method, so this
         // branch cannot currently run - confirm where the safe flag was
         // meant to come from
         if (isset($safe) && !$safe) {
             $link_meta_ids[] = "safe:false";
         }
     } else {
         $link_meta_ids[] = "media:text";
     }
     $link_meta_ids[] = "link:all";
     return $link_meta_ids;
 }

Ejemplo n.º 6

Mostrar archivo

Archivo: fetcher.php Proyecto: yakar/yioop

 /**
  * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
  * This method attempts to cull from the doc_info struct the
  * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
  * links whose filetype or site the crawler is forbidden from crawling.
  * The remaining links are then handed to UrlParser::pruneLinks(), which
  * is described as using a crude estimate of the information contained
  * in the link text, strlen(gzip(text)), to extract the best ones.
  *
  * Nothing is done if $doc_info has no entry for $field.
  *
  * @param array& $doc_info an array with a $field subarray
  * This subarray in turn contains url => text pairs.
  * @param string $field field for links default is CrawlConstants::LINKS
  * @param int $member_cache_time says how long allowed and disallowed url
  *      info should be cached by urlMemberSiteArray
  */
 function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS, $member_cache_time = 0)
 {
     // check the field that is actually pruned below (the guard used to
     // test self::LINKS regardless of which $field was requested)
     if (!isset($doc_info[$field])) {
         return;
     }
     $links = array();
     $allowed_name = "a" . $member_cache_time;
     $disallowed_name = "d" . $member_cache_time;
     foreach ($doc_info[$field] as $url => $text) {
         $doc_type = UrlParser::getDocumentType($url);
         if (!in_array($doc_type, $this->all_file_types)) {
             $doc_type = "unknown";
         }
         if (!in_array($doc_type, $this->indexed_file_types)) {
             continue;
         }
         if ($this->restrict_sites_by_url) {
             if (!UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $allowed_name)) {
                 continue;
             }
         }
         if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites, $disallowed_name)) {
             continue;
         }
         $links[$url] = $text;
     }
     $doc_info[$field] = UrlParser::pruneLinks($links);
 }

Ejemplo n.º 7

Mostrar archivo

Archivo: resource_controller.php Proyecto: yakar/yioop

 /**
  * Gets the resource $_REQUEST['n'] from the folder selected by
  * $_REQUEST['f'] and writes it to the response.
  *
  * For f in css, scripts, resources the base folder comes from
  * $this->getBaseFolder() (a 401 page is sent if that fails) and the name
  * is reduced to a file name plus extension; if $_REQUEST['t'] is set
  * ".jpg" is appended. For f = cache the request must pass
  * $this->checkRequest() and the file is read from CRAWL_DIR/cache.
  * Any other value of f is ignored.
  *
  * If both $_REQUEST['o'] (offset) and $_REQUEST['l'] (limit) are given,
  * only that slice of the file is sent. Requests carrying an HTTP Range
  * header are delegated to $this->serveRangeRequest(). A missing file
  * results in a redirect to ./error.php.
  */
 function get()
 {
     if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
         return;
     }
     $name = $this->clean($_REQUEST['n'], "string");
     if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
         /* notice in this case we don't check if request come from a
               legitimate source but we do try to restrict it to being
               a file (not a folder) in the above array
            */
         $base_dir = $this->getBaseFolder();
         if (!$base_dir) {
             header('HTTP/1.1 401 Unauthorized');
             echo "<html><head><title>401 Unauthorized</title></head>" . "<body><p>401 Unauthorized</p></body></html>";
             return;
         }
         // rebuild the name from its file-name and extension parts only
         $type = UrlParser::getDocumentType($name);
         $name = UrlParser::getDocumentFilename($name);
         $name = $type != "" ? "{$name}.{$type}" : $name;
         if (isset($_REQUEST['t'])) {
             $name .= ".jpg";
         }
     } else {
         if (in_array($_REQUEST['f'], array("cache"))) {
             /*  perform check since these request should come from a known
                     machine
                 */
             if (!$this->checkRequest()) {
                 return;
             }
             // NOTE(review): $name is only passed through clean() on this
             // branch - confirm it cannot contain "../" path segments
             $folder = $_REQUEST['f'];
             $base_dir = CRAWL_DIR . "/{$folder}";
         } else {
             return;
         }
     }
     if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
         $offset = $this->clean($_REQUEST['o'], "int");
         $limit = $this->clean($_REQUEST['l'], "int");
     }
     $path = "{$base_dir}/{$name}";
     if (file_exists($path)) {
         $mime_type = mimeType($path);
         $size = filesize($path);
         $start = 0;
         $end = $size - 1;
         header("Content-type: {$mime_type}");
         header("Accept-Ranges: bytes");
         if (isset($_SERVER['HTTP_RANGE'])) {
             $this->serveRangeRequest($path, $size, $start, $end);
             return;
         }
         // describes the whole file, as before, even when a slice is sent
         header("Content-Range: bytes {$start}-{$end}/{$size}");
         if (isset($offset) && isset($limit)) {
             $data = file_get_contents($path, false, NULL, $offset, $limit);
             if ($data === false) {
                 // a failed read sends an empty body, as echoing false did
                 $data = "";
             }
             // Content-Length must match the bytes actually sent; it used
             // to advertise the full file size for a partial body
             header("Content-Length: " . strlen($data));
             echo $data;
         } else {
             header("Content-Length: " . $size);
             readfile($path);
         }
     } else {
         header("Location:./error.php");
     }
 }