/**
 * Gets the next doc from the iterator
 *
 * Records are read one after another until one is found whose warc-type
 * is 'response' or 'resource' and whose URL does not begin with 'dns:'.
 * Every other record (warcinfo, request, metadata, revisit, etc.) is read
 * and discarded.
 *
 * @param bool $no_process do not do any processing on page data
 * @return mixed associative array for doc, or the raw header-and-page
 *     string if $no_process is true; NULL if the file handle check fails,
 *     the record headers are empty or carry no size, or no record data
 *     could be read
 */
function nextPage($no_process = false)
{
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $indexable_records = array('response', 'resource');
    do {
        $this->getRecordStart();
        $page_info = $this->getWarcHeaders();
        if ($page_info == NULL || !isset($page_info[self::SIZE])) {
            return NULL;
        }
        $length = intval($page_info[self::SIZE]);
        $page_info[self::SIZE] = $length;
        // Read two bytes more than the declared size, then strip any
        // leading whitespace from what was read.
        $header_and_page = ltrim($this->fileRead($length + 2));
        // Two further lines are consumed after the record body.
        $this->fileGets();
        $this->fileGets();
        // NOTE(review): this also fires for a record whose body is empty
        // after trimming, which ends iteration at that record -- confirm
        // that this is intended and not only an end-of-file check.
        if (!$header_and_page) {
            return NULL;
        }
        // Headers are not guaranteed to carry these two keys; default to
        // an empty string so a record without them is evaluated without
        // raising undefined-index warnings.
        $record_type = isset($page_info['warc-type']) ?
            $page_info['warc-type'] : '';
        $record_url = isset($page_info[self::URL]) ?
            $page_info[self::URL] : '';
        $skip_record = !in_array($record_type, $indexable_records, true) ||
            strncmp($record_url, 'dns:', 4) === 0;
    } while ($skip_record);
    if ($no_process) {
        return $header_and_page;
    }
    // These two header entries are dropped before the remaining headers
    // are merged into the returned doc.
    unset($page_info['line']);
    unset($page_info['warc-type']);
    $site = array_merge($page_info,
        FetchUrl::parseHeaderPage($header_and_page));
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    if (!isset($site[self::TYPE])) {
        $site[self::TYPE] = "text/plain";
    }
    return $site;
}
/**
 * Gets the next doc from the iterator
 *
 * Each record starts with a header line of space-separated fields whose
 * last field is taken as the record length. Records whose header line
 * starts with 'dns' or 'filedesc' are read and discarded.
 *
 * @param bool $no_process do not do any processing on page data
 * @return mixed associative array for doc, or the raw header-and-page
 *     string if $no_process is true; NULL if the file handle check fails,
 *     a blank header line is encountered, or no record data could be read
 */
function nextPage($no_process = false)
{
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    do {
        $page_info = $this->fileGets();
        if (trim($page_info) === "") {
            return NULL;
        }
        $info_parts = explode(" ", $page_info);
        // The last field of the header line is used as the record length.
        $length = intval($info_parts[count($info_parts) - 1]);
        // One byte more than the declared length is read.
        $header_and_page = $this->fileRead($length + 1);
        if (!$header_and_page) {
            return NULL;
        }
        // ignore dns entries in arc and ignore first record
        $skip_record = strncmp($page_info, 'dns', 3) === 0 ||
            strncmp($page_info, 'filedesc', 8) === 0;
    } while ($skip_record);
    if ($no_process) {
        return $header_and_page;
    }
    // A header line with fewer than four fields leaves the later fields
    // absent; read them defensively so a short line yields NULL entries
    // instead of undefined-offset warnings.
    $ip_address = isset($info_parts[1]) ? $info_parts[1] : NULL;
    $content_type = isset($info_parts[3]) ? $info_parts[3] : NULL;
    $archive_time = isset($info_parts[2]) ?
        strtotime($info_parts[2]) : false;
    $site = array();
    $site[self::URL] = $info_parts[0];
    $site[self::IP_ADDRESSES] = array($ip_address);
    // A missing or unparsable date falls back to the epoch ("0") rather
    // than handing a boolean to date().
    $site[self::TIMESTAMP] = ($archive_time === false) ?
        "0" : date("U", $archive_time);
    $site[self::TYPE] = $content_type;
    $site = array_merge($site,
        FetchUrl::parseHeaderPage($header_and_page));
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    return $site;
}