Exemplo n.º 1
0
 /**
  * Creates an open directory rdf archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     // Archive settings forwarded to the parent constructor as its fifth
     // argument.
     $archive_settings = array(
         'compression' => 'gzip',
         'file_extension' => 'gz',
         'encoding' => 'UTF-8',
         'start_delimiter' => '@Topic|ExternalPage@',
         'end_delimiter' => '@/Topic|/ExternalPage@'
     );
     parent::__construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir, $archive_settings);

     // Header fields are filled in only after the parent constructor has run.
     $this->header['base_address'] = "http://www.dmoz.org/";

     // Pull just the host component out of the base address and resolve it;
     // gethostbyname() hands back the host name unchanged if the lookup fails.
     $host_name = @parse_url($this->header['base_address'], PHP_URL_HOST);
     $this->header['ip_address'] = gethostbyname($host_name);
 }
Exemplo n.º 2
0
 /**
  * Restores the internal state from the file iterate_status.txt in the
  * result dir such that the next call to nextPages will pick up from just
  * after the last checkpoint. We also set up our regex substitutions again.
  *
  * @return array the data serialized when saveCheckpoint was called
  */
 function restoreCheckPoint()
 {
     // Let the parent restore its state first; its result is returned
     // unchanged to the caller.
     $checkpoint_data = parent::restoreCheckPoint();

     // When an iterate_dir is set there is nothing more to do here.
     if ($this->iterate_dir) {
         return $checkpoint_data;
     }

     // No iterate_dir: re-initialize the substitutions from the stored base
     // address (original note: "do on client not name server").
     $this->initializeSubstitutions($this->header['base_address']);
     return $checkpoint_data;
 }
Exemplo n.º 3
0
 /**
  * Creates a WARC archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     // Archive settings forwarded to the parent constructor as its fifth
     // argument; only a start delimiter is supplied for this format.
     $archive_settings = array(
         'compression' => 'gzip',
         'file_extension' => 'warc.gz',
         'encoding' => 'UTF-8',
         'start_delimiter' => '/WARC/'
     );
     parent::__construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir, $archive_settings);
 }