/**
 * Parse the NCT* files of the input directory and write the dataset description.
 *
 * Every path matched by `<indir>NCT*` is handed to process_file(), unless the file
 * is empty or its trial id is excluded by the optional comma-separated `id_list`
 * parameter. Output is written to `<outdir>clinicaltrials.<output_format>`.
 * Afterwards a description of the source and of the generated file is written to
 * the release file.
 *
 * @return false|null false when the release file cannot be opened for writing (and
 *                    the active error handler lets execution continue); nothing otherwise
 */
 function parse_dir()
 {
     $this->setCheckPoint('dataset');
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');
     $outdir = parent::getParameterValue("outdir");
     $output_format = parent::getParameterValue('output_format');

     // Zero-padded 24-hour clock plus the real UTC offset. The former "G" / literal "Z"
     // format dropped the leading zero of the hour and labelled the time as UTC
     // regardless of the configured timezone.
     $date_format = "Y-m-d\\TH:i:sP";
     $date = date($date_format);

     // Opening in "w" mode creates/truncates the release file and proves it is writable.
     // Nothing is written through this handle, so it is closed again right away.
     $dataset_file = $outdir . parent::getBio2RDFReleaseFile();
     $fp = fopen($dataset_file, "w");
     if ($fp === FALSE) {
         trigger_error("Unable to open {$dataset_file}", E_USER_ERROR);
         return false;
     }
     fclose($fp);

     // Optional whitelist of trial ids; null means "process everything".
     // Flipped so that membership is a constant-time isset() per file.
     $id_list = parent::getParameterValue('id_list');
     $ids = null;
     if ($id_list != '') {
         $ids = array_flip(array_map('trim', explode(",", $id_list)));
     }

     $indir = parent::getParameterValue('indir');
     echo "Processing {$indir}\n";
     $outfile = "clinicaltrials." . $output_format;
     $gz = strstr($output_format, ".gz") !== FALSE;
     parent::setWriteFile($outdir . $outfile, $gz);

     // glob() returns false on error; treat that like "no files".
     $files = glob($indir . "NCT*");
     if ($files === FALSE) {
         $files = array();
     }

     $last_file = null;
     foreach ($files as $i => $file) {
         if ($i % 10000 == 0) {
             parent::clear();
         }
         // Remembered for the retrieval date of the source description below.
         $last_file = $file;

         $trial_id = basename($file, '.xml');
         if ($ids !== null && !isset($ids[$trial_id])) {
             continue;
         }
         if (filesize($file) != 0) {
             echo "Processing {$trial_id}" . PHP_EOL;
             $this->process_file($file);
         } else {
             echo "Processing {$trial_id} -> Empty!" . PHP_EOL;
         }
     }
     echo "Finished." . PHP_EOL;
     parent::getWriteFile()->close();

     // make the dataset description
     parent::setGraphURI(parent::getDatasetURI());
     $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
     $source_version = parent::getDatasetVersion();

     // The retrieval date is the modification time of the last matched input file.
     // When nothing matched (or the time cannot be read) use the current time instead
     // of calling filemtime() on an undefined variable.
     $mtime = ($last_file !== null) ? filemtime($last_file) : FALSE;
     $retrieved = date($date_format, $mtime !== FALSE ? $mtime : time());

     // description of the source data
     $source_file = (new DataResource($this))
         ->setURI($rfile)
         ->setTitle("Clinicaltrials")
         ->setRetrievedDate($retrieved)
         ->setFormat("application/xml")
         ->setPublisher("http://clinicaltrials.gov/")
         ->setHomepage("http://clinicaltrials.gov/")
         ->setRights("use")
         ->setRights("by-attribution")
         ->setLicense("http://clinicaltrials.gov/ct2/about-site/terms-conditions")
         ->setDataset("http://identifiers.org/clinicaltrials/");
     parent::writeToReleaseFile($source_file->toRDF());

     // description of the generated file
     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
         ->setSource($source_file->getURI())
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strstr($output_format, "nt")) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }
     parent::writeToReleaseFile($output_file->toRDF());
     parent::closeReleaseFile();
 }
Exemple #2
0
 /**
  * Download (when needed) and convert each requested file, then describe the result.
  *
  * The `files` parameter is a "|"-separated list; the value "all" expands to the
  * declared parameter list minus its first entry. For every known file the method
  * fetches the source if it is missing locally or `download` is enabled, calls the
  * matching parse_<name>() method, and writes a description of the source and of
  * the generated file to the release file. Unknown names are skipped with a warning.
  */
 function Run()
 {
     $indir = parent::getParameterValue('indir');
     $outdir = parent::getParameterValue('outdir');
     $download_url = parent::getParameterValue('download_url');
     $output_format = parent::getParameterValue('output_format');
     $gz = strstr($output_format, ".gz") !== FALSE;

     // Parse the flag explicitly: with a loose `== true` the string "false"
     // counts as true and forces a re-download on every run.
     $force_download = filter_var(parent::getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

     if (parent::getParameterValue('files') == 'all') {
         $files = explode("|", parent::getParameterList('files'));
         // drop the first entry of the declared list (presumably "all" itself)
         array_shift($files);
     } else {
         $files = explode("|", parent::getParameterValue('files'));
     }
     if (parent::getParameterValue("id_list")) {
         // flipped so the ids become array keys
         $this->id_list = array_flip(explode(",", parent::getParameterValue('id_list')));
     }

     foreach ($files as $f) {
         // Only "drugbank" has a known source file. Anything else would reuse
         // undefined (or stale) file names, so skip it.
         if ($f !== 'drugbank') {
             trigger_error("Unknown file '{$f}' requested; skipping", E_USER_WARNING);
             continue;
         }
         $file = 'drugbank.xml.zip';
         $lname = 'drugbank';

         $fnx = 'parse_' . $f;
         $rfile = $download_url . $file;
         $lfile = $indir . $file;
         $cfile = $lname . "." . $output_format;

         // download
         if (!file_exists($lfile) || $force_download) {
             utils::downloadSingle($rfile, $lfile);
         }

         // setup the write
         parent::setWriteFile($outdir . $cfile, $gz);
         echo $outdir . $cfile;
         if (file_exists($lfile)) {
             // call the parser
             echo "processing {$file} ..." . PHP_EOL;
             $this->{$fnx}($indir, $file);
             echo "done" . PHP_EOL;
             parent::clear();
         }
         parent::getWriteFile()->close();

         // dataset description: switch to the dataset graph, restore it afterwards
         $ouri = parent::getGraphURI();
         parent::setGraphURI(parent::getDatasetURI());
         $source_version = parent::getDatasetVersion();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $prefix = parent::getPrefix();
         $date_format = "Y-m-d\\TH:i:sP";
         $date = date($date_format);

         // Retrieval date is the local file's modification time. Fall back to the
         // current time when the file is missing (e.g. the download failed) rather
         // than passing a failed filemtime() result to date().
         $mtime = file_exists($lfile) ? filemtime($lfile) : FALSE;
         $retrieved = date($date_format, $mtime !== FALSE ? $mtime : time());

         // description of the source data
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("DrugBank ({$file})")
             ->setRetrievedDate($retrieved)
             ->setFormat("application/xml")
             ->setFormat("application/zip")
             ->setPublisher("http://drugbank.ca")
             ->setHomepage("http://drugbank.ca")
             ->setRights("use")
             ->setRights("by-attribution")
             ->setRights("no-commercial")
             ->setLicense("http://www.drugbank.ca/about")
             ->setDataset("http://identifiers.org/drugbank/");

         // description of the generated file
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$cfile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/drugbank/drugbank.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr($output_format, "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         parent::writeToReleaseFile($source_file->toRDF() . $output_file->toRDF());
         parent::setGraphURI($ouri);
     }
     parent::closeReleaseFile();
 }