/**
 * Builds an iterator over an open directory RDF archive.
 *
 * The parent constructor receives the four location arguments unchanged
 * together with the archive settings used for this format: gzip
 * compression, the "gz" file extension, UTF-8 encoding and the
 * Topic/ExternalPage start and end delimiters. Afterwards the header's
 * base address is set to the dmoz.org site, and its ip address to whatever
 * gethostbyname() returns for that site's host name.
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *      iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the arc archive bundle
 *      results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    // Format description handed to the parent iterator.
    $settings = array();
    $settings['compression'] = 'gzip';
    $settings['file_extension'] = 'gz';
    $settings['encoding'] = 'UTF-8';
    $settings['start_delimiter'] = '@Topic|ExternalPage@';
    $settings['end_delimiter'] = '@/Topic|/ExternalPage@';

    parent::__construct(
        $iterate_timestamp,
        $iterate_dir,
        $result_timestamp,
        $result_dir,
        $settings
    );

    // Record the site address, then resolve its host name for the header.
    $dmoz_address = "http://www.dmoz.org/";
    $this->header['base_address'] = $dmoz_address;
    $address_parts = @parse_url($dmoz_address);
    $dmoz_host = $address_parts['host'];
    $this->header['ip_address'] = gethostbyname($dmoz_host);
}
/**
 * Builds an iterator over a MediaWiki archive.
 *
 * The parent constructor receives the four location arguments unchanged
 * together with the archive settings used for this format: bzip2
 * compression, the "bz2" file extension, UTF-8 encoding and a "page"
 * start delimiter (no end delimiter is supplied). The partition-switch
 * callback name is then set to "readMediaWikiHeader".
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *      iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the arc archive bundle
 *      results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    // Format description handed to the parent iterator.
    $settings = array();
    $settings['compression'] = 'bzip2';
    $settings['file_extension'] = 'bz2';
    $settings['encoding'] = 'UTF-8';
    $settings['start_delimiter'] = '@page@';

    parent::__construct(
        $iterate_timestamp,
        $iterate_dir,
        $result_timestamp,
        $result_dir,
        $settings
    );

    // NOTE(review): presumably the parent invokes this method by name
    // whenever it moves to a new partition file -- confirm in the parent.
    $callback_name = "readMediaWikiHeader";
    $this->switch_partition_callback_name = $callback_name;
}
/**
 * Builds an iterator over a WARC archive.
 *
 * All work is delegated to the parent constructor, which receives the
 * four location arguments unchanged together with the archive settings
 * used for this format: gzip compression, the "warc.gz" file extension,
 * UTF-8 encoding and a "WARC" start delimiter (no end delimiter is
 * supplied).
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *      iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the arc archive bundle
 *      results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    // Format description handed to the parent iterator.
    $settings = array();
    $settings['compression'] = 'gzip';
    $settings['file_extension'] = 'warc.gz';
    $settings['encoding'] = 'UTF-8';
    $settings['start_delimiter'] = '/WARC/';

    parent::__construct(
        $iterate_timestamp,
        $iterate_dir,
        $result_timestamp,
        $result_dir,
        $settings
    );
}