/**
 * Builds an iterator over an open directory RDF archive.
 *
 * The gzip/UTF-8 settings and the Topic/ExternalPage record delimiters
 * are handed to the parent constructor; afterwards the header is stamped
 * with the dmoz base address and the IP address its host resolves to.
 *
 * @param string $iterate_timestamp timestamp of the archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    // Format description forwarded to the parent iterator
    $archive_settings = array(
        'compression' => 'gzip',
        'file_extension' => 'gz',
        'encoding' => 'UTF-8',
        'start_delimiter' => '@Topic|ExternalPage@',
        'end_delimiter' => '@/Topic|/ExternalPage@'
    );
    parent::__construct($iterate_timestamp, $iterate_dir,
        $result_timestamp, $result_dir, $archive_settings);

    $base_address = "http://www.dmoz.org/";
    $this->header['base_address'] = $base_address;
    // Only the host portion is needed for the lookup (gethostbyname
    // performs a name resolution, so this may touch the network)
    $host = @parse_url($base_address, PHP_URL_HOST);
    $this->header['ip_address'] = gethostbyname($host);
}
/**
 * Restores the internal state from the file iterate_status.txt in the
 * result dir, so that the next call to nextPages picks up from just
 * after the last checkpoint. When no iterate_dir is set, the regex
 * substitutions are also set up again for the header's base address.
 *
 * @return array the data serialized when saveCheckpoint was called
 */
function restoreCheckPoint()
{
    $checkpoint_data = parent::restoreCheckPoint();
    if ($this->iterate_dir) {
        // an iterate_dir is present, so substitutions are left alone
        return $checkpoint_data;
    }
    // do on client not name server
    $this->initializeSubstitutions($this->header['base_address']);
    return $checkpoint_data;
}
/**
 * Builds a WARC archive iterator from the given parameters.
 *
 * All the work is delegated to the parent constructor, which receives
 * the settings for gzip-compressed, UTF-8 encoded "warc.gz" files
 * together with the /WARC/ start delimiter.
 *
 * @param string $iterate_timestamp timestamp of the archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    // Format description forwarded to the parent iterator
    $archive_settings = array(
        'compression' => 'gzip',
        'file_extension' => 'warc.gz',
        'encoding' => 'UTF-8',
        'start_delimiter' => '/WARC/'
    );
    parent::__construct($iterate_timestamp, $iterate_dir,
        $result_timestamp, $result_dir, $archive_settings);
}