Ejemplo n.º 1
0
 /**
  * Decides whether a url may be crawled, based on its file type and,
  * when site restriction is switched on, on the list of allowed sites.
  *
  * The document type reported by UrlParser::getDocumentType() is mapped to
  * "unknown" when it is not listed in $this->all_file_types; the url is
  * rejected unless the resulting type is in $this->indexed_file_types.
  *
  * @param string $url url to check
  * @return bool whether the url is allowed to be crawled or not
  */
 function allowedToCrawlSite($url)
 {
     $file_type = UrlParser::getDocumentType($url);
     $effective_type = in_array($file_type, $this->all_file_types) ?
         $file_type : "unknown";
     if (in_array($effective_type, $this->indexed_file_types)) {
         if (!$this->restrict_sites_by_url) {
             return true;
         }
         // name under which urlMemberSiteArray is asked to cache its lookup
         $cache_name = "a" . $this->allow_disallow_cache_time;
         return UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $cache_name);
     }
     return false;
 }
Ejemplo n.º 2
0
 /**
  * Guesses a mime type from the extension of a file name.
  *
  * The extension is obtained with UrlParser::getDocumentType() and looked
  * up in a fixed table; extensions missing from the table yield
  * "text/plain".
  *
  * NOTE(review): several table values (text/gitxml, text/java, text/py,
  * text/asp) are not registered media types; presumably they are
  * project-internal identifiers, so they are left as they were -- confirm
  * before normalizing them.
  *
  * @param string $file_name name of the file
  * @return string $mime_type for the given file name
  */
 static function guessMimeTypeFromFileName($file_name)
 {
     $mime_type_map = array(
         "bmp" => 'image/bmp',
         "doc" => 'application/msword',
         "epub" => 'application/epub+zip',
         "gif" => 'image/gif',
         "asp" => 'text/asp',
         "aspx" => 'text/asp',
         'cgi' => 'text/html',
         "cfm" => 'text/html',
         "cfml" => 'text/html',
         "do" => 'text/html',
         "htm" => 'text/html',
         "html" => 'text/html',
         "jsp" => 'text/html',
         "php" => 'text/html',
         "pl" => 'text/html',
         "java" => 'text/java',
         "py" => 'text/py',
         "shtml" => 'text/html',
         "jpg" => 'image/jpeg',
         "jpeg" => 'image/jpeg',
         "pdf" => 'application/pdf',
         "png" => 'image/png',
         "ppt" => 'application/vnd.ms-powerpoint',
         "pptx" => 'application/vnd.openxmlformats-officedocument.' .
             'presentationml.presentation',
         "rss" => 'application/rss+xml',
         "rtf" => 'text/rtf',
         "svg" => 'image/svg+xml',
         "csv" => 'text/csv',
         "tab" => 'text/tab-separated-values',
         // was 'text/x-java-source', which is a Java source type, not TSV
         "tsv" => 'text/tab-separated-values',
         "txt" => 'text/plain',
         "xlsx" => 'application/vnd.openxmlformats-officedocument.' .
             'spreadsheetml.sheet',
         "xml" => 'text/gitxml',
         "js" => 'text/plain',
         "c" => 'text/plain',
         "cc" => 'text/plain',
         "cs" => 'text/plain'
     );
     $extension = UrlParser::getDocumentType($file_name);
     if (isset($mime_type_map[$extension])) {
         return $mime_type_map[$extension];
     }
     // unrecognized extensions are treated as plain text
     return "text/plain";
 }
Ejemplo n.º 3
0
 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. Both <urlset> and <sitemapindex>
  * documents in the sitemaps.org 0.9 namespace are examined, and at most
  * MAX_LINKS_PER_SITEMAP links are returned. Weighting info in the sitemap
  * is ignored.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   url => description pairs for links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if ($nodes === false) {
             // evaluation failed (errors are suppressed above); nothing to walk
             continue;
         }
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             // skip urls that are empty, recursive, too long, or gzip'd
             // (at this point we can't handle gzip'd sitemaps)
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             // stop once the cap is reached; the previous "> MAX" test
             // let one link more than MAX_LINKS_PER_SITEMAP through
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }
Ejemplo n.º 4
0
 /**
  * Tries to determine the language of the document from the $url provided.
  *
  * In this implementation the result is whatever
  * UrlParser::getDocumentType() reports for $url; $sample_text is accepted
  * for interface compatibility but is not examined.
  *
  * @param string $sample_text sample text to try guess the language from
  *     (unused here)
  * @param string $url url of web-page used to figure out the language
  *
  * @return string language tag for guessed language, or NULL when no
  *     $url was supplied
  */
 static function calculateLang($sample_text = NULL, $url = NULL)
 {
     // initialise so the no-url case returns NULL explicitly instead of
     // reading an undefined variable
     $lang = NULL;
     if ($url != NULL) {
         $lang = UrlParser::getDocumentType($url);
     }
     return $lang;
 }
Ejemplo n.º 5
0
 /**
  * Used to compute all the meta ids for a given link with $url
  * and $link_text that was on a site with $site_url.
  *
  * Every link with a non-empty host gets "media:all", "safe:all" and
  * "link:all", plus "media:image" or "media:text" depending on whether its
  * document type is listed in the global $IMAGE_TYPES. Links whose text
  * starts with "location:" additionally get the link text itself,
  * "location:all" and "location:" followed by crawlHash($site_url).
  *
  * @param string $url url of the link
  * @param string $link_host url of the host name of the link
  * @param string $link_text text of the anchor tag link came from
  * @param string $site_url url of the page link was on
  * @return array meta id strings for the link; empty if $link_host is empty
  */
 static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
 {
     global $IMAGE_TYPES;
     $link_meta_ids = array();
     if (strlen($link_host) == 0) {
         // there is no enclosing loop here, so the former "continue" was a
         // fatal error; a link without a host simply has no meta ids
         return $link_meta_ids;
     }
     if (substr($link_text, 0, 9) == "location:") {
         $link_meta_ids[] = $link_text;
         $link_meta_ids[] = "location:all";
         $link_meta_ids[] = "location:" . crawlHash($site_url);
     }
     $link_type = UrlParser::getDocumentType($url);
     $link_meta_ids[] = "media:all";
     $link_meta_ids[] = "safe:all";
     if (in_array($link_type, $IMAGE_TYPES)) {
         // NOTE(review): a "safe:false" id used to be guarded by a $safe
         // variable that is never defined in this scope, so it could never
         // be emitted; the dead branch was dropped
         $link_meta_ids[] = "media:image";
     } else {
         $link_meta_ids[] = "media:text";
     }
     $link_meta_ids[] = "link:all";
     return $link_meta_ids;
 }
Ejemplo n.º 6
0
 /**
  * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
  * This method attempts to cull from the doc_info struct the
  * best MAX_LINKS_PER_PAGE. This is done by first removing links whose
  * file type or site the crawler is forbidden from crawling; the
  * remaining links are then handed to UrlParser::pruneLinks() to pick the
  * ones to keep.
  *
  * @param array& $doc_info an array with a $field subarray. This subarray
  *      in turn contains url => text pairs. It is replaced in place by
  *      the pruned links; nothing happens if the subarray is not set.
  * @param string $field field for links default is CrawlConstants::LINKS
  * @param int $member_cache_time says how long allowed and disallowed url
  *      info should be cached by urlMemberSiteArray
  */
 function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS, $member_cache_time = 0)
 {
     // check the field that is actually iterated below; the guard used to
     // test self::LINKS regardless of which $field was requested
     if (!isset($doc_info[$field])) {
         return;
     }
     $links = array();
     $allowed_name = "a" . $member_cache_time;
     $disallowed_name = "d" . $member_cache_time;
     foreach ($doc_info[$field] as $url => $text) {
         $doc_type = UrlParser::getDocumentType($url);
         if (!in_array($doc_type, $this->all_file_types)) {
             $doc_type = "unknown";
         }
         if (!in_array($doc_type, $this->indexed_file_types)) {
             continue;
         }
         if ($this->restrict_sites_by_url) {
             if (!UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $allowed_name)) {
                 continue;
             }
         }
         // disallowed sites are excluded even when restriction is off
         if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites, $disallowed_name)) {
             continue;
         }
         $links[$url] = $text;
     }
     $doc_info[$field] = UrlParser::pruneLinks($links);
 }
Ejemplo n.º 7
0
 /**
  * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
  * CRAWL_DIR/$_REQUEST['f']  after cleaning, and writes it to the output.
  *
  * Request fields read:
  *  n - name of the resource (required)
  *  f - folder: "css", "scripts", "resources" are served from the folder
  *      returned by getBaseFolder(); "cache" is served from
  *      CRAWL_DIR/cache and only after checkRequest() succeeds; any
  *      other value is ignored (required)
  *  t - if set (non-cache folders only) ".jpg" is appended to the name
  *  o, l - if both set, only $l bytes starting at byte offset $o are sent
  *
  * An HTTP Range header, if present, is delegated to serveRangeRequest().
  * A missing file results in a redirect to ./error.php.
  */
 function get()
 {
     if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
         return;
     }
     $name = $this->clean($_REQUEST['n'], "string");
     if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
         /* notice in this case we don't check if request come from a
               legitimate source but we do try to restrict it to being
               a file (not a folder) in the above array
            */
         $base_dir = $this->getBaseFolder();
         if (!$base_dir) {
             header('HTTP/1.1 401 Unauthorized');
             echo "<html><head><title>401 Unauthorized</title></head>" . "<body><p>401 Unauthorized</p></body></html>";
             return;
         }
         // rebuild the name from its filename and extension parts only
         $type = UrlParser::getDocumentType($name);
         $name = UrlParser::getDocumentFilename($name);
         $name = $type != "" ? "{$name}.{$type}" : $name;
         if (isset($_REQUEST['t'])) {
             $name .= ".jpg";
         }
     } else {
         if (in_array($_REQUEST['f'], array("cache"))) {
             /*  perform check since these request should come from a known
                     machine
                 */
             if (!$this->checkRequest()) {
                 return;
             }
             $folder = $_REQUEST['f'];
             $base_dir = CRAWL_DIR . "/{$folder}";
             // NOTE(review): unlike the branch above, $name is not reduced
             // to a bare file name here; presumably clean() and
             // checkRequest() are relied on to keep the path inside
             // $base_dir -- confirm
         } else {
             return;
         }
     }
     if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
         // negative values make no sense for a byte slice (and a negative
         // length is rejected by file_get_contents), so clamp to zero
         $offset = max(0, (int) $this->clean($_REQUEST['o'], "int"));
         $limit = max(0, (int) $this->clean($_REQUEST['l'], "int"));
     }
     $path = "{$base_dir}/{$name}";
     if (file_exists($path)) {
         $mime_type = mimeType($path);
         $size = filesize($path);
         $start = 0;
         $end = $size - 1;
         header("Content-type: {$mime_type}");
         header("Accept-Ranges: bytes");
         if (isset($_SERVER['HTTP_RANGE'])) {
             $this->serveRangeRequest($path, $size, $start, $end);
             return;
         }
         if (isset($offset) && isset($limit)) {
             // read the slice first so the headers can describe what is
             // really sent; previously the full file size was announced
             // while only $limit bytes were written
             $data = "";
             if ($limit > 0 && $offset < $size) {
                 $data = file_get_contents($path, false, NULL, $offset, $limit);
                 if ($data === false) {
                     $data = "";
                 }
             }
             $length = strlen($data);
             header("Content-Length: " . $length);
             if ($length > 0) {
                 $start = $offset;
                 $end = $offset + $length - 1;
                 header("Content-Range: bytes {$start}-{$end}/{$size}");
             }
             echo $data;
         } else {
             header("Content-Length: " . $size);
             header("Content-Range: bytes {$start}-{$end}/{$size}");
             readfile($path);
         }
     } else {
         header("Location:./error.php");
     }
 }