/**
 * Calculates the meta words to be associated with a given downloaded
 * document. These words are associated with the document in the index
 * (for example, server:apache) even if the document itself did not
 * contain them.
 *
 * Most families of meta words are emitted as a catch-all "family:all"
 * word followed by one or more specific "family:value" words.
 *
 * Fields of $site that are read unconditionally: CrawlConstants::URL and
 * CrawlConstants::HTTP_CODE. Fields that are used only when present:
 * META_WORDS, SIZE, TOTAL_TIME, DNS_TIME, LINKS, LOCATION, IP_ADDRESSES,
 * TYPE, SERVER, SERVER_VERSION, OPERATING_SYSTEM, MODIFIED, TIMESTAMP,
 * LANG, AGENT_LIST and SUBDOCTYPE.
 *
 * @param array& $site associative array containing info about a downloaded
 *     (or read from archive) document. It is taken by reference but this
 *     method only reads from it.
 * @param array $video_sources used to check if a page should be marked as
 *     having meta media:video. When it is empty no media:video,
 *     media:image or media:text word is produced, only media:all.
 * @return array of meta words to be associated with this document
 */
static function calculateMetas(&$site, $video_sources = array())
{
    $meta_ids = array();

    // Start from any user added meta words.
    if (isset($site[CrawlConstants::META_WORDS])) {
        $meta_ids = $site[CrawlConstants::META_WORDS];
    }

    $url = $site[CrawlConstants::URL];

    /*
        Handle the built-in meta words. For example, store the sites the
        document belongs to, so you can search by site.
     */
    $url_sites = array_merge(
        UrlParser::getHostPaths($url),
        UrlParser::getHostSubdomains($url)
    );
    $meta_ids[] = 'site:all';
    foreach ($url_sites as $url_site) {
        if (strlen($url_site) > 0) {
            $meta_ids[] = 'site:' . $url_site;
        }
    }

    // One path: word per cumulative prefix of the url path, e.g. a path
    // of /a/b yields path:/, path:/a and path:/a/b.
    $path = UrlParser::getPath($url);
    if (strlen($path) > 0) {
        $meta_ids[] = 'path:all';
        $meta_ids[] = 'path:/';
        $pre_path = "";
        foreach (explode("/", $path) as $part) {
            if (strlen($part) > 0) {
                $pre_path .= "/{$part}";
                $meta_ids[] = 'path:' . $pre_path;
            }
        }
    }

    // The page can be looked up either by its url or by the hash of it.
    $meta_ids[] = 'info:' . $url;
    $meta_ids[] = 'info:' . crawlHash($url);

    $meta_ids[] = 'code:all';
    $meta_ids[] = 'code:' . $site[CrawlConstants::HTTP_CODE];

    // Only a url that is exactly "<host>/" gets host:all; this is used to
    // count the number of distinct hosts.
    if (UrlParser::getHost($url) . "/" === $url) {
        $meta_ids[] = 'host:all';
    }

    // Sizes and times are rounded down to a multiple of their interval
    // so that documents fall into a limited number of buckets.
    if (isset($site[CrawlConstants::SIZE])) {
        $meta_ids[] = "size:all";
        $interval = DOWNLOAD_SIZE_INTERVAL;
        $size = floor($site[CrawlConstants::SIZE] / $interval) * $interval;
        $meta_ids[] = "size:{$size}";
    }
    if (isset($site[CrawlConstants::TOTAL_TIME])) {
        $meta_ids[] = "time:all";
        $interval = DOWNLOAD_TIME_INTERVAL;
        $time = floor($site[CrawlConstants::TOTAL_TIME] / $interval)
            * $interval;
        $meta_ids[] = "time:{$time}";
    }
    if (isset($site[CrawlConstants::DNS_TIME])) {
        $meta_ids[] = "dns:all";
        $interval = DOWNLOAD_TIME_INTERVAL;
        $time = floor($site[CrawlConstants::DNS_TIME] / $interval)
            * $interval;
        $meta_ids[] = "dns:{$time}";
    }

    // The keys of the LINKS array are used as the link urls.
    if (isset($site[CrawlConstants::LINKS])) {
        $num_links = count($site[CrawlConstants::LINKS]);
        $meta_ids[] = "numlinks:all";
        $meta_ids[] = "numlinks:{$num_links}";
        $link_urls = array_keys($site[CrawlConstants::LINKS]);
        $meta_ids[] = "link:all";
        foreach ($link_urls as $link_url) {
            $meta_ids[] = 'link:' . $link_url;
            $meta_ids[] = 'link:' . crawlHash($link_url);
        }
    }

    // Each location is also made reachable through info: words, in the
    // same way as the document's own url above.
    if (isset($site[CrawlConstants::LOCATION]) &&
        is_array($site[CrawlConstants::LOCATION])) {
        foreach ($site[CrawlConstants::LOCATION] as $location) {
            $meta_ids[] = 'info:' . $location;
            $meta_ids[] = 'info:' . crawlHash($location);
            $meta_ids[] = 'location:all';
            $meta_ids[] = 'location:' . $location;
        }
    }

    if (isset($site[CrawlConstants::IP_ADDRESSES])) {
        $meta_ids[] = 'ip:all';
        // Guard the loop so a non-array value cannot trigger a foreach
        // warning; ip:all is still recorded as before.
        if (is_array($site[CrawlConstants::IP_ADDRESSES])) {
            foreach ($site[CrawlConstants::IP_ADDRESSES] as $address) {
                $meta_ids[] = 'ip:' . $address;
            }
        }
    }

    $meta_ids[] = 'media:all';
    if ($video_sources != array()) {
        if (UrlParser::isVideoUrl($url, $video_sources)) {
            $meta_ids[] = "media:video";
        } else {
            // A document without a recorded type is treated as text
            // rather than raising an undefined index notice.
            $type = isset($site[CrawlConstants::TYPE]) ?
                $site[CrawlConstants::TYPE] : '';
            $meta_ids[] = (stripos($type, "image") !== false) ?
                'media:image' : 'media:text';
        }
    }

    // Store the filetype info as derived from the url.
    $url_type = UrlParser::getDocumentType($url);
    if (strlen($url_type) > 0) {
        $meta_ids[] = 'filetype:all';
        $meta_ids[] = 'filetype:' . $url_type;
    }

    // Server and operating system names are lower-cased; the server
    // version is stored as is.
    if (isset($site[CrawlConstants::SERVER])) {
        $meta_ids[] = 'server:all';
        $meta_ids[] = 'server:' . strtolower($site[CrawlConstants::SERVER]);
    }
    if (isset($site[CrawlConstants::SERVER_VERSION])) {
        $meta_ids[] = 'version:all';
        $meta_ids[] = 'version:' . $site[CrawlConstants::SERVER_VERSION];
    }
    if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
        $meta_ids[] = 'os:all';
        $meta_ids[] = 'os:' .
            strtolower($site[CrawlConstants::OPERATING_SYSTEM]);
    }

    // Dates are emitted at increasing precision so a search can match on
    // a year, a month, a day and so on. Both values are passed to date()
    // as timestamps.
    if (isset($site[CrawlConstants::MODIFIED])) {
        $modified = $site[CrawlConstants::MODIFIED];
        $meta_ids[] = 'modified:all';
        foreach (array('Y', 'Y-m', 'Y-m-d') as $format) {
            $meta_ids[] = 'modified:' . date($format, $modified);
        }
    }
    if (isset($site[CrawlConstants::TIMESTAMP])) {
        $date = $site[CrawlConstants::TIMESTAMP];
        $meta_ids[] = 'date:all';
        $date_formats = array('Y', 'Y-m', 'Y-m-d', 'Y-m-d-H', 'Y-m-d-H-i',
            'Y-m-d-H-i-s');
        foreach ($date_formats as $format) {
            $meta_ids[] = 'date:' . date($format, $date);
        }
    }

    // The part of the language tag before the first "-" is always
    // stored; the full tag is added as well when there is a second part.
    if (isset($site[CrawlConstants::LANG])) {
        $meta_ids[] = 'lang:all';
        $lang_parts = explode("-", $site[CrawlConstants::LANG]);
        $meta_ids[] = 'lang:' . $lang_parts[0];
        if (isset($lang_parts[1])) {
            $meta_ids[] = 'lang:' . $site[CrawlConstants::LANG];
        }
    }

    // Same non-array guard as for the ip addresses above.
    if (isset($site[CrawlConstants::AGENT_LIST]) &&
        is_array($site[CrawlConstants::AGENT_LIST])) {
        foreach ($site[CrawlConstants::AGENT_LIST] as $agent) {
            $meta_ids[] = 'robot:' . strtolower($agent);
        }
    }

    // Add the catch-all meta word for the subdoctype.
    if (isset($site[CrawlConstants::SUBDOCTYPE])) {
        $meta_ids[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
    }

    return $meta_ids;
}