/**
 * Decides whether a url may be crawled. The url's document type has to be
 * one of the indexed file types and, when crawling is restricted by url,
 * the url must also be a member of the allowed sites list.
 *
 * @param string $url url to check
 * @return bool whether is allowed to be crawled or not
 */
function allowedToCrawlSite($url)
{
    $extension = UrlParser::getDocumentType($url);
    // any type outside the known list is lumped together as "unknown"
    $file_type = in_array($extension, $this->all_file_types) ?
        $extension : "unknown";
    if (!in_array($file_type, $this->indexed_file_types)) {
        return false;
    }
    if (!$this->restrict_sites_by_url) {
        return true;
    }
    // the cache name is "a" followed by the allow/disallow cache time
    return UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
        "a" . $this->allow_disallow_cache_time);
}
/**
 * Guess mime type based on extension of the file
 *
 * The extension is obtained with UrlParser::getDocumentType() and looked up
 * in a fixed table; extensions missing from the table yield "text/plain".
 *
 * @param string $file_name name of the file
 * @return string $mime_type for the given file name
 */
static function guessMimeTypeFromFileName($file_name)
{
    /* NOTE(review): 'text/asp', 'text/java', 'text/py' and 'text/gitxml'
       are not registered media types; presumably they are project-internal
       labels used to pick a processor, so they are left untouched --
       confirm before changing them.
     */
    $mime_type_map = array(
        "bmp" => 'image/bmp',
        "doc" => 'application/msword',
        "epub" => 'application/epub+zip',
        "gif" => 'image/gif',
        "asp" => 'text/asp',
        "aspx" => 'text/asp',
        'cgi' => 'text/html',
        "cfm" => 'text/html',
        "cfml" => 'text/html',
        "do" => 'text/html',
        "htm" => 'text/html',
        "html" => 'text/html',
        "jsp" => 'text/html',
        "php" => 'text/html',
        "pl" => 'text/html',
        "java" => 'text/java',
        "py" => 'text/py',
        "shtml" => 'text/html',
        "jpg" => 'image/jpeg',
        "jpeg" => 'image/jpeg',
        "pdf" => 'application/pdf',
        "png" => 'image/png',
        "ppt" => 'application/vnd.ms-powerpoint',
        "pptx" => 'application/vnd.openxmlformats-officedocument.' .
            'presentationml.presentation',
        "rss" => 'application/rss+xml',
        "rtf" => 'text/rtf',
        "svg" => 'image/svg+xml',
        "csv" => 'text/csv',
        "tab" => 'text/tab-separated-values',
        // was 'text/x-java-source', which has nothing to do with tsv files
        "tsv" => 'text/tab-separated-values',
        "txt" => 'text/plain',
        "xlsx" => 'application/vnd.openxmlformats-officedocument.' .
            'spreadsheetml.sheet',
        "xml" => 'text/gitxml',
        "js" => 'text/plain',
        "c" => 'text/plain',
        "cc" => 'text/plain',
        "cs" => 'text/plain'
    );
    $extension = UrlParser::getDocumentType($file_name);
    return isset($mime_type_map[$extension]) ?
        $mime_type_map[$extension] : "text/plain";
}
/**
 * Returns links from the supplied dom object of a sitemap
 * where links have been canonicalized according to
 * the supplied $site information. We allow more links from a sitemap
 * than from other kinds of documents. For now we are ignoring weighting
 * info
 *
 * Both <urlset> and <sitemapindex> documents are handled. Urls that are
 * empty, recursive, gzip'd or at least MAX_URL_LEN long are skipped, and
 * collection stops once MAX_LINKS_PER_SITEMAP urls have been accepted.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object, as url => description pairs
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $xpath->registerNamespace('s',
        "http://www.sitemaps.org/schemas/sitemap/0.9");
    $paths = array(
        "/s:urlset/s:url/s:loc",
        "/s:sitemapindex/s:sitemap/s:loc"
    );
    $num_links = 0;
    foreach ($paths as $path) {
        $nodes = @$xpath->evaluate($path);
        if (!($nodes instanceof DOMNodeList)) {
            // evaluate() failed (its warning is suppressed); nothing to walk
            continue;
        }
        foreach ($nodes as $node) {
            $url = UrlParser::canonicalLink($node->textContent, $site);
            if ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                UrlParser::getDocumentType($url) == "gz" ||
                strlen($url) >= MAX_URL_LEN) {
                //at this point we can't handle gzip'd sitemaps
                continue;
            }
            $sites[$url] = "From sitemap of " . $site;
            $num_links++;
            /* >= rather than >: the counter is bumped before the test, so
               a strict comparison let one link too many through
             */
            if ($num_links >= MAX_LINKS_PER_SITEMAP) {
                break 2;
            }
        }
    }
    return $sites;
}
/**
 * Tries to determine the language of the document from the $url provided.
 *
 * As written, the "language" is whatever UrlParser::getDocumentType()
 * returns for $url; $sample_text is accepted but not examined.
 *
 * @param string $sample_text sample text to try guess the language from
 *      (currently unused)
 * @param string $url url of the document whose document type is used as
 *      the language
 *
 * @return mixed language tag for guessed language, or NULL when no $url
 *      was supplied
 */
static function calculateLang($sample_text = NULL, $url = NULL)
{
    // default so that a missing $url no longer reads an undefined variable
    $lang = NULL;
    if ($url != NULL) {
        $lang = UrlParser::getDocumentType($url);
    }
    return $lang;
}
/**
 * Used to compute all the meta ids for a given link with $url
 * and $link_text that was on a site with $site_url.
 *
 * @param string $url url of the link
 * @param string $link_host url of the host name of the link
 * @param string $link_text text of the anchor tag link came from
 * @param string $site_url url of the page link was on
 * @return array meta ids for the link; empty when $link_host is empty
 */
static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
{
    global $IMAGE_TYPES;
    $link_meta_ids = array();
    if (strlen($link_host) == 0) {
        /* this used to be a `continue`, which is not valid outside of a
           loop; a link without a host simply gets no meta ids
         */
        return $link_meta_ids;
    }
    if (substr($link_text, 0, 9) == "location:") {
        $link_meta_ids[] = $link_text;
        $link_meta_ids[] = "location:all";
        $link_meta_ids[] = "location:" . crawlHash($site_url);
    }
    $link_type = UrlParser::getDocumentType($url);
    $link_meta_ids[] = "media:all";
    $link_meta_ids[] = "safe:all";
    // tolerate the global not having been initialised as an array
    $image_types = is_array($IMAGE_TYPES) ? $IMAGE_TYPES : array();
    /* the former "safe:false" branch tested a $safe variable that was never
       assigned in this function, so it could never run and was dropped
     */
    $link_meta_ids[] = in_array($link_type, $image_types) ?
        "media:image" : "media:text";
    $link_meta_ids[] = "link:all";
    return $link_meta_ids;
}
/**
 * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
 * This method attempts to cull from the doc_info struct the
 * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
 * links whose file type or site the crawler is forbidden to crawl.
 * The surviving links are then handed to UrlParser::pruneLinks, which
 * (according to the original notes) keeps the best ones using a crude
 * estimate of the information contained in the link text:
 * strlen(gzip(text)).
 *
 * Nothing is done when $doc_info has no $field entry.
 *
 * @param array& $doc_info an array with a $field subarray (by default
 *      CrawlConstants::LINKS). This subarray in turn contains
 *      url => text pairs. It is overwritten with the pruned links.
 * @param string $field field for links default is CrawlConstants::LINKS
 * @param int $member_cache_time says how long allowed and disallowed url
 *      info should be cached by urlMemberSiteArray
 */
function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS,
    $member_cache_time = 0)
{
    /* check the field that is actually processed below; the old test
       looked at self::LINKS even when a different $field was requested
     */
    if (!isset($doc_info[$field])) {
        return;
    }
    $links = array();
    $allowed_name = "a" . $member_cache_time;
    $disallowed_name = "d" . $member_cache_time;
    foreach ($doc_info[$field] as $url => $text) {
        $doc_type = UrlParser::getDocumentType($url);
        if (!in_array($doc_type, $this->all_file_types)) {
            $doc_type = "unknown";
        }
        if (!in_array($doc_type, $this->indexed_file_types)) {
            continue;
        }
        if ($this->restrict_sites_by_url &&
            !UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
                $allowed_name)) {
            continue;
        }
        if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
            $disallowed_name)) {
            continue;
        }
        $links[$url] = $text;
    }
    $doc_info[$field] = UrlParser::pruneLinks($links);
}
/**
 * Gets the resource $_REQUEST['n'] from APP_DIR/$_REQUEST['f'] or
 * CRAWL_DIR/$_REQUEST['f'] after cleaning
 *
 * Request fields used:
 *  n - name of the resource (required)
 *  f - folder: "css", "scripts", "resources" or "cache" (required); any
 *      other value makes the method return without output
 *  t - if set (non-cache folders only), ".jpg" is appended to the name
 *  o, l - if both set, only the slice of the file starting at offset o
 *      and of length at most l is output
 * An HTTP Range header, when present, is delegated to serveRangeRequest().
 * If the file does not exist the client is redirected to ./error.php.
 */
function get()
{
    if (!isset($_REQUEST['n']) || !isset($_REQUEST['f'])) {
        return;
    }
    $name = $this->clean($_REQUEST['n'], "string");
    if (in_array($_REQUEST['f'], array("css", "scripts", "resources"))) {
        /* notice in this case we don't check if request come from a
           legitimate source but we do try to restrict it to being
           a file (not a folder) in the above array
         */
        $base_dir = $this->getBaseFolder();
        if (!$base_dir) {
            header('HTTP/1.1 401 Unauthorized');
            echo "<html><head><title>401 Unauthorized</title></head>" .
                "<body><p>401 Unauthorized</p></body></html>";
            return;
        }
        // rebuild the name from just its file name and extension
        $type = UrlParser::getDocumentType($name);
        $name = UrlParser::getDocumentFilename($name);
        $name = $type != "" ? "{$name}.{$type}" : $name;
        if (isset($_REQUEST['t'])) {
            $name .= ".jpg";
        }
    } else if (in_array($_REQUEST['f'], array("cache"))) {
        /* perform check since these request should come from a known
           machine
         */
        if (!$this->checkRequest()) {
            return;
        }
        /* NOTE(review): unlike the branch above, $name is used here exactly
           as returned by clean(), without removing directory components;
           confirm that ".." segments cannot escape CRAWL_DIR/cache
         */
        $folder = $_REQUEST['f'];
        $base_dir = CRAWL_DIR . "/{$folder}";
    } else {
        return;
    }
    if (isset($_REQUEST['o']) && isset($_REQUEST['l'])) {
        $offset = $this->clean($_REQUEST['o'], "int");
        $limit = $this->clean($_REQUEST['l'], "int");
    }
    $path = "{$base_dir}/{$name}";
    if (!file_exists($path)) {
        header("Location:./error.php");
        return;
    }
    $mime_type = mimeType($path);
    $size = filesize($path);
    $start = 0;
    $end = $size - 1;
    header("Content-type: {$mime_type}");
    header("Accept-Ranges: bytes");
    if (isset($_SERVER['HTTP_RANGE'])) {
        $this->serveRangeRequest($path, $size, $start, $end);
        return;
    }
    header("Content-Range: bytes {$start}-{$end}/{$size}");
    if (isset($offset) && isset($limit)) {
        $data = file_get_contents($path, false, NULL, $offset, $limit);
        if ($data === false) {
            $data = "";
        }
        /* advertise the number of bytes really sent; announcing the whole
           file size for a slice leaves clients waiting for missing bytes
         */
        header("Content-Length: " . strlen($data));
        echo $data;
    } else {
        header("Content-Length: " . $size);
        readfile($path);
    }
}