/**
 * Calculates the meta words to be associated with a given downloaded
 * document. These words will be associated with the document in the
 * index (for example, server:apache) even if the document itself did not
 * contain them.
 *
 * @param array& $site associated array containing info about a downloaded
 *     (or read from archive) document.
 * @param array $video_sources used to check if a page should be marked as
 *     having meta media:video
 * @return array of meta words to be associate with this document
 */
static function calculateMetas(&$site, $video_sources = array())
{
    $meta_ids = array();
    // handles user added meta words
    if (isset($site[CrawlConstants::META_WORDS])) {
        $meta_ids = $site[CrawlConstants::META_WORDS];
    }
    /* Handle the built-in meta words. For example store the sites the
       doc_key belongs to, so you can search by site */
    $url_sites = UrlParser::getHostPaths($site[CrawlConstants::URL]);
    $url_sites = array_merge($url_sites,
        UrlParser::getHostSubdomains($site[CrawlConstants::URL]));
    $meta_ids[] = 'site:all';
    foreach ($url_sites as $url_site) {
        if (strlen($url_site) > 0) {
            $meta_ids[] = 'site:' . $url_site;
        }
    }
    // one path: meta word for each prefix of the url's path component
    $path = UrlParser::getPath($site[CrawlConstants::URL]);
    if (strlen($path) > 0) {
        $path_parts = explode("/", $path);
        $pre_path = "";
        $meta_ids[] = 'path:all';
        $meta_ids[] = 'path:/';
        foreach ($path_parts as $part) {
            if (strlen($part) > 0) {
                $pre_path .= "/{$part}";
                $meta_ids[] = 'path:' . $pre_path;
            }
        }
    }
    // info: words allow looking a document up by url or by url hash
    $meta_ids[] = 'info:' . $site[CrawlConstants::URL];
    $meta_ids[] = 'info:' . crawlHash($site[CrawlConstants::URL]);
    $meta_ids[] = 'code:all';
    $meta_ids[] = 'code:' . $site[CrawlConstants::HTTP_CODE];
    if (UrlParser::getHost($site[CrawlConstants::URL]) . "/" ==
        $site[CrawlConstants::URL]) {
        $meta_ids[] = 'host:all'; //used to count number of distinct hosts
    }
    // bucket download size into DOWNLOAD_SIZE_INTERVAL-wide ranges
    if (isset($site[CrawlConstants::SIZE])) {
        $meta_ids[] = "size:all";
        $interval = DOWNLOAD_SIZE_INTERVAL;
        $size = floor($site[CrawlConstants::SIZE] / $interval) * $interval;
        $meta_ids[] = "size:{$size}";
    }
    // bucket total download time into DOWNLOAD_TIME_INTERVAL-wide ranges
    if (isset($site[CrawlConstants::TOTAL_TIME])) {
        $meta_ids[] = "time:all";
        $interval = DOWNLOAD_TIME_INTERVAL;
        $time = floor($site[CrawlConstants::TOTAL_TIME] / $interval) *
            $interval;
        $meta_ids[] = "time:{$time}";
    }
    // bucket dns lookup time the same way as total time
    if (isset($site[CrawlConstants::DNS_TIME])) {
        $meta_ids[] = "dns:all";
        $interval = DOWNLOAD_TIME_INTERVAL;
        $time = floor($site[CrawlConstants::DNS_TIME] / $interval) *
            $interval;
        $meta_ids[] = "dns:{$time}";
    }
    // outgoing links: searchable both by link count and by link target
    if (isset($site[CrawlConstants::LINKS])) {
        $num_links = count($site[CrawlConstants::LINKS]);
        $meta_ids[] = "numlinks:all";
        $meta_ids[] = "numlinks:{$num_links}";
        $link_urls = array_keys($site[CrawlConstants::LINKS]);
        $meta_ids[] = "link:all";
        foreach ($link_urls as $url) {
            $meta_ids[] = 'link:' . $url;
            $meta_ids[] = 'link:' . crawlHash($url);
        }
    }
    if (isset($site[CrawlConstants::LOCATION]) &&
        is_array($site[CrawlConstants::LOCATION]) &&
        $site[CrawlConstants::LOCATION] != array()) {
        /* add location:all exactly once (it used to be appended for each
           redirect location, producing duplicate meta words); the guard
           above keeps an empty LOCATION array from adding anything, as
           before */
        $meta_ids[] = 'location:all';
        foreach ($site[CrawlConstants::LOCATION] as $location) {
            $meta_ids[] = 'info:' . $location;
            $meta_ids[] = 'info:' . crawlHash($location);
            $meta_ids[] = 'location:' . $location;
        }
    }
    if (isset($site[CrawlConstants::IP_ADDRESSES])) {
        $meta_ids[] = 'ip:all';
        foreach ($site[CrawlConstants::IP_ADDRESSES] as $address) {
            $meta_ids[] = 'ip:' . $address;
        }
    }
    $meta_ids[] = 'media:all';
    /* media:{video|image|text} is only computed when a list of video
       sources was supplied by the caller */
    if ($video_sources != array()) {
        if (UrlParser::isVideoUrl($site[CrawlConstants::URL],
            $video_sources)) {
            $meta_ids[] = "media:video";
        } else {
            $meta_ids[] = stripos($site[CrawlConstants::TYPE], "image")
                !== false ? 'media:image' : 'media:text';
        }
    }
    // store the filetype info
    $url_type = UrlParser::getDocumentType($site[CrawlConstants::URL]);
    if (strlen($url_type) > 0) {
        $meta_ids[] = 'filetype:all';
        $meta_ids[] = 'filetype:' . $url_type;
    }
    if (isset($site[CrawlConstants::SERVER])) {
        $meta_ids[] = 'server:all';
        $meta_ids[] = 'server:' . strtolower($site[CrawlConstants::SERVER]);
    }
    if (isset($site[CrawlConstants::SERVER_VERSION])) {
        $meta_ids[] = 'version:all';
        $meta_ids[] = 'version:' .
            $site[CrawlConstants::SERVER_VERSION];
    }
    if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
        $meta_ids[] = 'os:all';
        $meta_ids[] = 'os:' .
            strtolower($site[CrawlConstants::OPERATING_SYSTEM]);
    }
    /* last-modified date at year, month, and day granularity so prefix
       searches like modified:2013-02 work */
    if (isset($site[CrawlConstants::MODIFIED])) {
        $modified = $site[CrawlConstants::MODIFIED];
        $meta_ids[] = 'modified:all';
        $meta_ids[] = 'modified:' . date('Y', $modified);
        $meta_ids[] = 'modified:' . date('Y-m', $modified);
        $meta_ids[] = 'modified:' . date('Y-m-d', $modified);
    }
    // crawl timestamp, down to second granularity
    if (isset($site[CrawlConstants::TIMESTAMP])) {
        $date = $site[CrawlConstants::TIMESTAMP];
        $meta_ids[] = 'date:all';
        $meta_ids[] = 'date:' . date('Y', $date);
        $meta_ids[] = 'date:' . date('Y-m', $date);
        $meta_ids[] = 'date:' . date('Y-m-d', $date);
        $meta_ids[] = 'date:' . date('Y-m-d-H', $date);
        $meta_ids[] = 'date:' . date('Y-m-d-H-i', $date);
        $meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date);
    }
    /* lang:en for a page in en-US; the full lang:en-US form is added
       only when a region subtag is present */
    if (isset($site[CrawlConstants::LANG])) {
        $meta_ids[] = 'lang:all';
        $lang_parts = explode("-", $site[CrawlConstants::LANG]);
        $meta_ids[] = 'lang:' . $lang_parts[0];
        if (isset($lang_parts[1])) {
            $meta_ids[] = 'lang:' . $site[CrawlConstants::LANG];
        }
    }
    if (isset($site[CrawlConstants::AGENT_LIST])) {
        foreach ($site[CrawlConstants::AGENT_LIST] as $agent) {
            $meta_ids[] = 'robot:' . strtolower($agent);
        }
    }
    //Add all meta word for subdoctype
    if (isset($site[CrawlConstants::SUBDOCTYPE])) {
        $meta_ids[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
    }
    return $meta_ids;
}
/**
 * Calculates the company level domain for the given url
 *
 * For www.yahoo.com the cld is yahoo.com, for
 * www.theregister.co.uk it is theregister.co.uk. It is
 * similar for organizations.
 *
 * @param string $url url to determine cld for
 * @return string the cld of $url
 */
function getCompanyLevelDomain($url)
{
    $host_parts = UrlParser::getHostSubdomains($url);
    /* getHostSubdomains on www.yahoo.com yields:
       [0] => com, [1] => .com, [2] => yahoo.com, [3] => .yahoo.com, ...
       so without at least entries 0 and 2 there is no cld to report */
    if (!isset($host_parts[0]) || !isset($host_parts[2])) {
        return "";
    }
    /* a 2-letter tld followed by a 5-char second component (e.g. co.uk)
       suggests a country-style registry, so the cld is one level deeper */
    $looks_like_country_registry = strlen($host_parts[0]) == 2 &&
        strlen($host_parts[2]) == 5;
    if ($looks_like_country_registry && isset($host_parts[4])) {
        return $host_parts[4];
    }
    return $host_parts[2];
}