Пример #1
0
 /**
  * Creates an open directory rdf archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     // Options forwarded to the parent constructor as its fifth argument.
     // NOTE(review): the delimiter values look like '@'-delimited regex
     // patterns marking where a record starts and ends -- confirm against
     // the parent class.
     $settings = array(
         'compression' => 'gzip',
         'file_extension' => 'gz',
         'encoding' => 'UTF-8',
         'start_delimiter' => '@Topic|ExternalPage@',
         'end_delimiter' => '@/Topic|/ExternalPage@'
     );
     parent::__construct(
         $iterate_timestamp,
         $iterate_dir,
         $result_timestamp,
         $result_dir,
         $settings
     );

     // Store the fixed base address in the header, then resolve its host
     // name to an IP address. gethostbyname() performs a DNS lookup and
     // returns the host name unchanged when resolution fails.
     $base_address = "http://www.dmoz.org/";
     $this->header['base_address'] = $base_address;
     $host = @parse_url($base_address, PHP_URL_HOST);
     $this->header['ip_address'] = gethostbyname($host);
 }
Пример #2
0
 /**
  * Creates a media wiki archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     // Options forwarded to the parent constructor as its fifth argument.
     // Only a start delimiter is supplied here; no end delimiter is set.
     $settings = array(
         'compression' => 'bzip2',
         'file_extension' => 'bz2',
         'encoding' => 'UTF-8',
         'start_delimiter' => '@page@'
     );
     parent::__construct(
         $iterate_timestamp,
         $iterate_dir,
         $result_timestamp,
         $result_dir,
         $settings
     );

     // Assigned after the parent constructor has run.
     // NOTE(review): presumably the name of a method invoked when the
     // iterator moves to a new partition -- verify against the parent class.
     $this->switch_partition_callback_name = "readMediaWikiHeader";
 }
Пример #3
0
 /**
  * Creates a WARC archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     // All setup is delegated to the parent constructor; the fifth argument
     // carries the options for this archive format. Only a start delimiter
     // is supplied; no end delimiter is set.
     parent::__construct(
         $iterate_timestamp,
         $iterate_dir,
         $result_timestamp,
         $result_dir,
         array(
             'compression' => 'gzip',
             'file_extension' => 'warc.gz',
             'encoding' => 'UTF-8',
             'start_delimiter' => '/WARC/'
         )
     );
 }