/**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return mixed associative array for the doc, the raw record string if
  *      $no_process is true, or NULL if no further record can be read
  */
 function nextPage($no_process = false)
 {
     // Reads forward through the archive until a 'response' or 'resource'
     // record whose URL does not start with 'dns:' is found. Returns that
     // record as a raw string when $no_process is true, otherwise as an
     // associative array built from the WARC headers merged with the output
     // of FetchUrl::parseHeaderPage(). Returns NULL when the file handle is
     // unusable, when the headers are missing or lack a size, or when the
     // record body comes back empty.
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     // Only these WARC record types are returned; warcinfo, request,
     // metadata, revisit, etc. records are skipped.
     $indexable_records = array('response', 'resource');
     do {
         $this->getRecordStart();
         $page_info = $this->getWarcHeaders();
         if (empty($page_info) || !isset($page_info[self::SIZE])) {
             return NULL;
         }
         // Normalize the declared size to an integer, both for the read
         // below and for the value handed back to the caller.
         $length = intval($page_info[self::SIZE]);
         $page_info[self::SIZE] = $length;
         $header_and_page = ltrim($this->fileRead($length + 2));
         // Read and discard the next two lines after the payload
         // (presumably the record's trailing separator -- TODO confirm).
         $this->fileGets();
         $this->fileGets();
         // NOTE(review): an empty payload ends iteration here even when the
         // record would otherwise have been skipped -- confirm intended.
         if (!$header_and_page) {
             return NULL;
         }
         // Guard both lookups so a record lacking either header is handled
         // without an undefined-index notice: a missing type counts as not
         // indexable, a missing URL counts as not being a dns record.
         $is_indexable = isset($page_info['warc-type']) &&
             in_array($page_info['warc-type'], $indexable_records, true);
         $is_dns = isset($page_info[self::URL]) &&
             substr($page_info[self::URL], 0, 4) === 'dns:';
     } while (!$is_indexable || $is_dns);
     if ($no_process) {
         return $header_and_page;
     }
     // These two keys are dropped so they do not appear in the returned doc.
     unset($page_info['line']);
     unset($page_info['warc-type']);
     $site = $page_info;
     $site_contents = FetchUrl::parseHeaderPage($header_and_page);
     // Values from the parsed header/page override same-named WARC values.
     $site = array_merge($site, $site_contents);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     if (!isset($site[self::TYPE])) {
         $site[self::TYPE] = "text/plain";
     }
     return $site;
 }
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return mixed associative array for the doc, the raw record string if
  *      $no_process is true, or NULL if no further record can be read
  */
 function nextPage($no_process = false)
 {
     // Reads forward through the archive until a record whose header line
     // does not start with 'dns' or 'filedesc' is found. Returns that record
     // as a raw string when $no_process is true, otherwise as an associative
     // array built from the header line fields merged with the output of
     // FetchUrl::parseHeaderPage(). Returns NULL when the file handle is
     // unusable, when a blank header line is read, or when the record body
     // comes back empty.
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     do {
         $page_info = $this->fileGets();
         if (trim((string) $page_info) === "") {
             return NULL;
         }
         // The header line is space separated; its last field is the
         // length of the record body that follows.
         $info_parts = explode(" ", $page_info);
         $length = intval($info_parts[count($info_parts) - 1]);
         // One byte more than the declared length is read (presumably the
         // newline that ends the record -- TODO confirm).
         $header_and_page = $this->fileRead($length + 1);
         if (!$header_and_page) {
             return NULL;
         }
         // Skip dns entries and the filedesc record that describes the
         // archive itself.
         $is_skipped = strncmp($page_info, 'dns', 3) === 0 ||
             strncmp($page_info, 'filedesc', 8) === 0;
     } while ($is_skipped);
     if ($no_process) {
         return $header_and_page;
     }
     // Pad to four fields so a short or malformed header line yields NULL
     // values instead of undefined-offset errors.
     $fields = array_pad($info_parts, 4, NULL);
     // strtotime() returns false on an unparseable date; fall back to 0 so
     // date() still receives an integer.
     $timestamp = strtotime((string) $fields[2]);
     if ($timestamp === false) {
         $timestamp = 0;
     }
     $site = array();
     $site[self::URL] = $fields[0];
     $site[self::IP_ADDRESSES] = array($fields[1]);
     $site[self::TIMESTAMP] = date("U", $timestamp);
     $site[self::TYPE] = $fields[3];
     $site_contents = FetchUrl::parseHeaderPage($header_and_page);
     // Values from the parsed header/page override same-named line fields.
     $site = array_merge($site, $site_contents);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     return $site;
 }