Example #1
 /**
  * Gets the next doc from the iterator
  *
  * Reads the next <page> record from a MediaWiki XML dump, extracts the
  * title, revision timestamp, and wiki text, and wraps the parsed text in
  * a minimal synthetic HTML page keyed by this class' crawl constants.
  *
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     // NOTE(review): declared but never read inside this method --
     // presumably shared with initializeSubstitutions(); confirm before
     // removing
     static $minimal_regexes = false;
     // true only until the first record has been processed in this run
     static $first_call = true;
     if ($first_call) {
         // build wiki-markup substitution tables relative to this wiki's
         // base address (once per process)
         $this->initializeSubstitutions($this->header['base_address']);
     }
     $page_info = $this->getNextTagData("page");
     if ($no_process) {
         return $page_info;
     }
     $dom = new DOMDocument();
     // @ suppresses libxml warnings for malformed XML in the dump
     @$dom->loadXML($page_info);
     $site = array();
     // wiki URLs use underscores where the page title has spaces
     $pre_url = $this->getTextContent($dom, "/page/title");
     $pre_url = str_replace(" ", "_", $pre_url);
     $site[self::URL] = $this->header['base_address'] . $pre_url;
     $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
     $pre_timestamp = $this->getTextContent($dom, "/page/revision/timestamp");
     // MODIFIED is the revision time; TIMESTAMP is when we extracted it
     $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
     $site[self::TIMESTAMP] = time();
     $site[self::TYPE] = "text/html";
     $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = "UTF-8";
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     // start of the synthetic HTML document built around the wiki text
     $site[self::PAGE] = "<html lang='" . $this->header['lang'] . "' >\n" . "<head><title>{$pre_url}</title>\n" . WIKI_PAGE_STYLES . "\n</head>\n" . "<body><h1>{$pre_url}</h1>\n";
     $pre_page = $this->getTextContent($dom, "/page/revision/text");
     // NOTE(review): $current_hash is computed but never used in this
     // method -- confirm whether it is needed before deleting
     $current_hash = crawlHash($pre_page);
     if ($first_call) {
         $this->saveCheckPoint();
         //ensure we remember to advance one on fail
         $first_call = false;
     }
     // convert the raw wiki markup into HTML
     $pre_page = $this->parser->parse($pre_page, false, true);
     /* NOTE(review): the replacement string references $1 but the pattern
        has no capture group, so "$1" expands to the empty string; the
        pattern likely was meant to capture an {{Other uses|...}} argument
        -- confirm against upstream before changing */
     $pre_page = preg_replace("/{{Other uses}}/i", "<div class='indent'>\"\$1\". (<a href='" . $site[self::URL] . "_(disambiguation)'>{$pre_url}</a>)</div>", $pre_page);
     $site[self::PAGE] .= $pre_page;
     $site[self::PAGE] .= "\n</body>\n</html>";
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     // longer pages get log-scaled extra weight, with a floor of 1
     $site[self::WEIGHT] = ceil(max(log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
     return $site;
 }
Example #2
 /**
  * Gets the next doc from the iterator
  *
  * Refills the local byte buffer from the underlying file until the record
  * delimiter is found, slices out the record text, and either returns the
  * raw string or a crawl-constant keyed summary array for it.
  *
  * @param bool $no_process if true then just return page string found
  *     not any additional meta data.
  * @return mixed associative array for doc or just string of doc;
  *     NULL on a bad file handle or when end of data is reached
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $matches = array();
     // keep appending blocks to the buffer until a delimiter shows up
     while (preg_match($this->delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE) != 1) {
         crawlTimeoutLog("..still looking for a page in local buffer");
         $block = $this->getFileBlock();
         if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
             return NULL;
         }
         $this->buffer .= $block;
     }
     $delim_len = strlen($matches[0][0]);
     $pos = $matches[0][1] + $delim_len;
     /* when records are introduced by a start delimiter, the delimiter we
        just matched belongs to the NEXT record, so exclude it from this
        page */
     $page_pos = $this->start_delimiter == "" ? $pos : $pos - $delim_len;
     $page = substr($this->buffer, 0, $page_pos);
     if ($this->end_delimiter == "") {
         // no end delimiter: text before the matched start delimiter is the
         // tail of the previous record; carry the delimiter to the next call
         $page = $this->remainder . $page;
         $this->remainder = $matches[0][0];
     }
     $this->buffer = substr($this->buffer, $pos + $delim_len);
     if ($this->start_delimiter != "") {
         $matches = array();
         /* Bug fix: the original tested preg_match(...) != 1 and searched
            $this->buffer, which made the trimming branch below dead code
            (preg_match only fills $matches on a successful match, and the
            buffer no longer contains this page). Search the extracted
            $page and, on a match, drop any bytes before the start
            delimiter. */
         if (preg_match($this->start_delimiter, $page, $matches, PREG_OFFSET_CAPTURE) == 1) {
             if (isset($matches[0][1])) {
                 $page = substr($page, $matches[0][1]);
             }
         }
     }
     if ($no_process == true) {
         return $page;
     }
     // build the standard summary array for a plain-text record
     $site = array();
     $site[self::HEADER] = "text_archive_bundle_iterator extractor";
     $site[self::IP_ADDRESSES] = array("0.0.0.0");
     $site[self::TIMESTAMP] = date("U", time());
     $site[self::TYPE] = "text/plain";
     $site[self::PAGE] = $page;
     $site[self::HASH] = FetchUrl::computePageHash($page);
     // records have no real URL, so derive a synthetic one from the hash
     $site[self::URL] = "record:" . webencode($site[self::HASH]);
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = $this->encoding;
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     $site[self::WEIGHT] = 1;
     return $site;
 }
Example #3
 /**
  * Gets the next doc from the iterator
  *
  * Reads the next Topic or ExternalPage record from an ODP RDF dump and
  * dispatches to the matching process-method (processTopic /
  * processExternalPage) to fill in page-specific summary fields.
  *
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true;
  *     NULL on a bad file handle, false when no further tag data exists
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $tag_data = $this->getNextTagsData(array("Topic", "ExternalPage"));
     if (!$tag_data) {
         return false;
     }
     list($page_info, $tag) = $tag_data;
     if ($no_process) {
         return $page_info;
     }
     // strip RDF namespace prefixes so DOMDocument sees plain tag names
     $page_info = str_replace("r:id", "id", $page_info);
     $page_info = str_replace("r:resource", "resource", $page_info);
     $page_info = str_replace("d:Title", "Title", $page_info);
     $page_info = str_replace("d:Description", "Description", $page_info);
     $dom = new DOMDocument();
     $dom->loadXML($page_info);
     // either "processTopic" or "processExternalPage"
     $processMethod = "process" . $tag;
     // bug fix: $site was previously used below without initialization,
     // producing an undefined-variable warning in modern PHP
     $site = array();
     $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
     $site[self::MODIFIED] = time();
     $site[self::TIMESTAMP] = time();
     $site[self::TYPE] = "text/html";
     $site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = "UTF-8";
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     // delegate record-specific extraction; fills in PAGE among others
     $this->{$processMethod}($dom, $site);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     return $site;
 }
Example #4
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     // raw pages that will be written to the local web archive cache
     $stored_site_pages = array();
     // extracted summaries that this method returns
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
             /* we print out errors to std output. We still go ahead and
                   process the page. Maybe it is a cool error page, also
                   this makes sure we don't crawl it again
                */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                        disallow crawling if robots.txt was any error other
                        than not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 // source-code crawls pick a processor by file extension
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
         /*deals with short URLs and directs them to the original link
           for robots.txt don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
             /* NOTE(review): array_push() returns the NEW ELEMENT COUNT,
                not the array, so this assignment replaces
                $site[self::LOCATION] with an int; it probably should read
                array_push($site[self::LOCATION], $tmp_loc) with no
                assignment -- confirm how LOCATION is consumed before
                fixing */
             $site[self::LOCATION] = array_push($site[self::LOCATION], $tmp_loc);
             // synthesize a stub doc for the redirect rather than running
             // a page processor on it
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 // no processor registered for this mimetype: skip the page
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             // instantiate the processor, handing over any indexing
             // plugins registered for this processor class
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                 //if HEBREW WINDOWS-1255 use ISO-8859 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     // Windows-1256 (Arabic) handled by a custom routine
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             /* for git repository files, temporarily index under the file
                name rather than the repository url, restoring it after the
                processor runs */
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             // trim down the extracted link set unless this is a robots.txt
             // page or a metas-only document
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 // redirect stubs get a hash derived from the url itself
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 // forbid listing and php execution inside the page cache
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     // dedup hash for text pages is over the description
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 // binary pages are hashed over the raw page bytes
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were dsallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 // 'en' with a non-UTF-8 encoding usually means detection
                 // fell back to the default; re-guess from the encoding
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             /* NOTE(review): when the entry was just set to false above,
                this offset assignment on a scalar warns and is a no-op, so
                the entry stays false and is removed by the array_filter
                below -- presumably intentional; confirm */
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
     // end foreach over $site_pages
     $num_pages = count($stored_site_pages);
     // drop entries marked false (NOARCHIVE etc.) before caching
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     // copy cache offsets back onto the summaries that were stored
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }
Example #5
 /**
  * Gets the next at most $num many docs from the iterator. It might return
  * less than $num many documents if the partition changes or the end of the
  * bundle is reached.
  *
  * Each database row is serialized into a single string of
  * field/value pairs using the configured separators.
  *
  * @param int $num number of docs to get
  * @param bool $no_process do not do any processing on page data
  * @return array associative arrays for $num pages (raw strings when
  *     $no_process is true)
  */
 function nextPages($num, $no_process = false)
 {
     $pages = array();
     $page_count = 0;
     $db = $this->db;
     // page through the result set using the saved limit as the offset
     $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
     $result = $db->execute($query);
     while ($row = $db->fetchArray($result)) {
         /* bug fix: the progress counter ($i) was initialized but never
            incremented, so the timeout log always reported row 0; report
            the number of rows fetched so far instead */
         crawlTimeoutLog("..Still getting pages from archive iterator. At %s" . " of %s", $page_count, $num);
         // serialize the row as fieldSEPvalueSEP... pairs
         $page = "";
         foreach ($row as $key => $value) {
             $page .= "{$key}{$this->field_value_separator}" . "{$value}{$this->column_separator}";
         }
         if ($no_process) {
             $pages[] = $page;
         } else {
             $site = array();
             $site[self::HEADER] = "database_bundle_iterator extractor";
             $site[self::IP_ADDRESSES] = array("0.0.0.0");
             $site[self::TIMESTAMP] = date("U", time());
             $site[self::TYPE] = "text/plain";
             $site[self::PAGE] = $page;
             $site[self::HASH] = FetchUrl::computePageHash($page);
             // rows have no URL, so synthesize one from the content hash
             $site[self::URL] = "record:" . webencode($site[self::HASH]);
             $site[self::HTTP_CODE] = 200;
             $site[self::ENCODING] = $this->encoding;
             $site[self::SERVER] = "unknown";
             $site[self::SERVER_VERSION] = "unknown";
             $site[self::OPERATING_SYSTEM] = "unknown";
             $site[self::WEIGHT] = 1;
             $pages[] = $site;
         }
         $page_count++;
     }
     // advance the offset so the next call resumes where this one stopped
     $this->limit += $page_count;
     if ($page_count < $num) {
         $this->end_of_iterator = true;
     }
     $this->saveCheckpoint();
     return $pages;
 }
Example #6
 /**
  * Gets the next doc from the iterator
  *
  * Reads WARC records until one worth indexing (a response or resource
  * record that is not a dns lookup) is found, then returns either the raw
  * header-and-page string or a crawl-constant keyed summary array.
  *
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $indexable_records = array('response', 'resource');
     while (true) {
         $this->getRecordStart();
         $page_info = $this->getWarcHeaders();
         if ($page_info == NULL || !isset($page_info[self::SIZE])) {
             return NULL;
         }
         // normalize the record size to an integer
         $length = intval($page_info[self::SIZE]);
         $page_info[self::SIZE] = $length;
         $header_and_page = ltrim($this->fileRead($length + 2));
         // consume the blank lines terminating the record
         $this->fileGets();
         $this->fileGets();
         if (!$header_and_page) {
             return NULL;
         }
         // skip warcinfo, request, metadata, revisit, etc. records and
         // any record for a dns lookup
         if (in_array($page_info['warc-type'], $indexable_records) &&
             substr($page_info[self::URL], 0, 4) != 'dns:') {
             break;
         }
     }
     if ($no_process) {
         return $header_and_page;
     }
     // drop bookkeeping fields before folding headers into the summary
     unset($page_info['line'], $page_info['warc-type']);
     $site = array_merge($page_info,
         FetchUrl::parseHeaderPage($header_and_page));
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     if (!isset($site[self::TYPE])) {
         $site[self::TYPE] = "text/plain";
     }
     return $site;
 }
Example #7
 /**
  * Gets the next doc from the iterator
  *
  * Reads ARC records until one that is neither a dns entry nor the
  * leading filedesc record is found, then returns either the raw record
  * bytes or a crawl-constant keyed summary array built from the ARC
  * header line and the parsed HTTP response.
  *
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     while (true) {
         $page_info = $this->fileGets();
         if (trim($page_info) == "") {
             return NULL;
         }
         // ARC header line is space separated; its final field is the
         // record length in bytes
         $info_parts = explode(" ", $page_info);
         $length = intval($info_parts[count($info_parts) - 1]);
         $header_and_page = $this->fileRead($length + 1);
         if (!$header_and_page) {
             return NULL;
         }
         // ignore dns entries in arc and ignore first (filedesc) record
         if (substr($page_info, 0, 3) != 'dns' &&
             substr($page_info, 0, 8) != 'filedesc') {
             break;
         }
     }
     if ($no_process) {
         return $header_and_page;
     }
     // header fields in order: url, ip, timestamp, mimetype, ...
     $site = array();
     $site[self::URL] = $info_parts[0];
     $site[self::IP_ADDRESSES] = array($info_parts[1]);
     $site[self::TIMESTAMP] = date("U", strtotime($info_parts[2]));
     $site[self::TYPE] = $info_parts[3];
     $site = array_merge($site, FetchUrl::parseHeaderPage($header_and_page));
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     return $site;
 }