Ejemplo n.º 1
0
 /**
  * Parse every trial record found in the input directory.
  *
  * Globs "NCT*" under the 'indir' parameter, hands each non-empty file to
  * process_file() (restricted to the comma-separated 'id_list' parameter when
  * that is set), then writes the source and output dataset descriptions to the
  * Bio2RDF release file.
  *
  * @return false|null false when the release file cannot be opened; nothing otherwise.
  */
 function parse_dir()
 {
     $this->setCheckPoint('dataset');
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');
     $output_format = parent::getParameterValue('output_format');
     $outdir = parent::getParameterValue("outdir");

     // "H" is the zero-padded 24h hour ("G" is not padded, giving e.g. "T5:03:02").
     // gmdate() is used so that the literal "Z" (UTC) suffix is actually true.
     $date_format = 'Y-m-d\TH:i:s\Z';
     $date = gmdate($date_format);

     $dataset_file = $outdir . parent::getBio2RDFReleaseFile();
     $fp = fopen($dataset_file, "w");
     if ($fp === false) {
         trigger_error("Unable to open {$dataset_file}", E_USER_ERROR);
         return false;
     }

     // An empty id_list means "process every trial".
     $id_list = parent::getParameterValue('id_list');
     $process_all = ($id_list == '');
     $ids = explode(",", (string) $id_list);

     $indir = parent::getParameterValue('indir');
     echo "Processing {$indir}\n";
     $outfile = "clinicaltrials." . $output_format;
     $gz = strpos($output_format, ".gz") !== false;
     parent::setWriteFile($outdir . $outfile, $gz);

     // glob() returns false on error; treat that like "no files".
     $files = glob($indir . "NCT*");
     if ($files === false) {
         $files = [];
     }
     foreach ($files as $i => $file) {
         // NOTE(review): presumably releases buffered state every 10000 files - confirm.
         if ($i % 10000 == 0) {
             parent::clear();
         }
         $trial_id = basename($file, '.xml');
         if (!$process_all && !in_array($trial_id, $ids, true)) {
             continue;
         }
         if (filesize($file) != 0) {
             echo "Processing {$trial_id}" . PHP_EOL;
             $this->process_file($file);
         } else {
             echo "Processing {$trial_id} -> Empty!" . PHP_EOL;
         }
     }
     echo "Finished." . PHP_EOL;
     parent::getWriteFile()->close();

     // make the dataset description
     parent::setGraphURI(parent::getDatasetURI());
     $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
     $source_version = parent::getDatasetVersion();

     // The retrieval date is the mtime of the last matched file. Fall back to
     // "now" when nothing matched or the mtime cannot be read, instead of
     // reading an undefined loop variable.
     $mtime = $files ? filemtime(end($files)) : false;
     $retrieved = gmdate($date_format, $mtime !== false ? $mtime : time());

     // dataset description: the upstream source
     $source_file = (new DataResource($this))
         ->setURI($rfile)
         ->setTitle("Clinicaltrials")
         ->setRetrievedDate($retrieved)
         ->setFormat("application/xml")
         ->setPublisher("http://clinicaltrials.gov/")
         ->setHomepage("http://clinicaltrials.gov/")
         ->setRights("use")
         ->setRights("by-attribution")
         ->setLicense("http://clinicaltrials.gov/ct2/about-site/terms-conditions")
         ->setDataset("http://identifiers.org/clinicaltrials/");
     parent::writeToReleaseFile($source_file->toRDF());

     // dataset description: the generated RDF file
     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
         ->setSource($source_file->getURI())
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strpos($output_format, "nt") !== false) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }
     parent::writeToReleaseFile($output_file->toRDF());
     parent::closeReleaseFile();

     // release the handle opened on the dataset description file
     fclose($fp);
 }
Ejemplo n.º 2
0
 /**
  * Download (when needed) and convert each requested DrugBank file, then
  * append the source/output dataset descriptions to the release file.
  *
  * The 'files' parameter is either "all" or a "|"-separated list of file keys.
  * Only keys listed in $known_files below are handled; anything else is
  * reported and skipped.
  */
 function Run()
 {
     $indir = parent::getParameterValue('indir');
     $outdir = parent::getParameterValue('outdir');
     $download_url = parent::getParameterValue('download_url');
     $output_format = parent::getParameterValue('output_format');
     $gz = strpos($output_format, ".gz") !== false;

     // A plain "== true" is also true for the string "false"; parse the flag
     // as a boolean so that "false"/"0"/"" do not force a re-download.
     $force_download = filter_var(parent::getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

     if (parent::getParameterValue('files') == 'all') {
         $files = explode("|", parent::getParameterList('files'));
         // NOTE(review): presumably drops the leading "all" entry of the list - confirm.
         array_shift($files);
     } else {
         $files = explode("|", parent::getParameterValue('files'));
     }
     if (parent::getParameterValue("id_list")) {
         $this->id_list = array_flip(explode(",", parent::getParameterValue('id_list')));
     }

     // file key => [remote/local file name, base name of the output file]
     $known_files = [
         'drugbank' => ['drugbank.xml.zip', 'drugbank'],
     ];

     foreach ($files as $f) {
         if (!isset($known_files[$f])) {
             // Previously an unknown key fell through with $file/$lname undefined
             // (or left over from the previous iteration).
             trigger_error("Unknown file '{$f}' requested; skipping", E_USER_WARNING);
             continue;
         }
         list($file, $lname) = $known_files[$f];

         $fnx = 'parse_' . $f;
         $rfile = $download_url . $file;
         $lfile = $indir . $file;
         $cfile = $lname . "." . $output_format;

         // download
         if (!file_exists($lfile) || $force_download) {
             utils::downloadSingle($rfile, $lfile);
         }

         // setup the write
         parent::setWriteFile($outdir . $cfile, $gz);
         echo $outdir . $cfile;
         $have_source = file_exists($lfile);
         if ($have_source) {
             // call the parser
             echo "processing {$file} ..." . PHP_EOL;
             $this->{$fnx}($indir, $file);
             echo "done" . PHP_EOL;
             parent::clear();
         }
         parent::getWriteFile()->close();

         // dataset description is written in the dataset graph; the previous
         // graph URI is restored at the end of the iteration
         $ouri = parent::getGraphURI();
         parent::setGraphURI(parent::getDatasetURI());
         $source_version = parent::getDatasetVersion();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $prefix = parent::getPrefix();
         $date = date("Y-m-d\\TH:i:sP");

         // filemtime() fails when the download did not produce a file; use the
         // current time rather than passing false on to date().
         $mtime = $have_source ? filemtime($lfile) : false;
         $retrieved = date("Y-m-d\\TH:i:sP", $mtime !== false ? $mtime : time());

         // dataset description
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("DrugBank ({$file})")
             ->setRetrievedDate($retrieved)
             ->setFormat("application/xml")
             ->setFormat("application/zip")
             ->setPublisher("http://drugbank.ca")
             ->setHomepage("http://drugbank.ca")
             ->setRights("use")
             ->setRights("by-attribution")
             ->setRights("no-commercial")
             ->setLicense("http://www.drugbank.ca/about")
             ->setDataset("http://identifiers.org/drugbank/");
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$cfile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/drugbank/drugbank.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strpos($output_format, "nt") !== false) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         parent::writeToReleaseFile($source_file->toRDF() . $output_file->toRDF());
         parent::setGraphURI($ouri);
     }
     parent::closeReleaseFile();
 }
Ejemplo n.º 3
0
 /**
  * Fetch the RefSeq protein release files, convert each one, and write the
  * accumulated dataset descriptions to the release file.
  *
  * Steps: (1) obtain the list of remote files, from the FTP site or from the
  * cached "ftp_list.txt" in 'indir'; (2) download any file that is missing
  * locally, or all of them when the 'download' parameter is set; (3) run
  * process() over every "gz" file returned by getFilePaths() and describe the
  * source/output pair.
  */
 function Run()
 {
     $dataset_description = '';
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $download_url = parent::getParameterValue('download_url');
     $output_format = parent::getParameterValue('output_format');

     // A plain "== true" is also true for the string "false"; parse the flag
     // as a boolean so that "false"/"0"/"" do not force a re-download.
     $force_download = filter_var($this->getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

     $list_file = $ldir . "ftp_list.txt";
     if (!file_exists($list_file) || $force_download) {
         echo "Getting FTP file list ...";
         $list = $this->getFtpFileList('ftp.ncbi.nlm.nih.gov', '/refseq/release/complete/', '/(complete\\.[0-9]+\\.protein\\.gpff\\.gz)/');
         // empty() also covers a null/false return, which count() would reject.
         if (empty($list)) {
             trigger_error("Unable to get list of files from FTP site. Check internet connection", E_USER_ERROR);
             exit(-1);
         }
         asort($list);
         file_put_contents($list_file, implode("\n", $list));
         echo "Done." . PHP_EOL;
     } else {
         echo "Using existing ftp list" . PHP_EOL;
         // Skip blank lines so a trailing newline does not yield an empty file name.
         $list = file($list_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
         if ($list === false) {
             $list = [];
         }
     }

     $counter = 1;
     $total = count($list);
     foreach ($list as $f) {
         $lfile = $ldir . $f;
         echo "Processing " . $counter++ . "/{$total} {$f}. ";
         if (!file_exists($lfile) || $force_download) {
             $rfile = $download_url . $f;
             echo "Downloading ...";
             utils::DownloadSingle($rfile, $lfile);
             echo "done.";
         } else {
             echo "Using existing file.";
         }
         echo PHP_EOL;
     }

     // "H" is the zero-padded 24h hour ("G" is not padded); gmdate() makes the
     // literal "Z" (UTC) suffix true.
     $date_format = 'Y-m-d\TH:i:s\Z';
     $gz = strstr($output_format, "gz") ? true : false;
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');

     // iterate over the local files
     $files = $this->getFilePaths($ldir, 'gz');
     asort($files);
     foreach ($files as $f) {
         $lfile = $ldir . $f;
         $ofile = $odir . basename($f, ".gz") . "." . $output_format;
         parent::setWriteFile($ofile, $gz);
         parent::setReadFile($lfile, true);
         echo "processing {$f} ...";
         $this->process();
         parent::clear();
         echo "done!" . PHP_EOL;
         $this->getReadFile()->close();
         $this->getWriteFile()->close();

         $mtime = filemtime($lfile);
         $retrieved = gmdate($date_format, $mtime !== false ? $mtime : time());

         // The remote URI is the download URL plus the file name, as in the
         // download loop above - not plus the local path.
         $source_file = (new DataResource($this))
             ->setURI($download_url . $f)
             ->setTitle("NCBI RefSeq - {$f}")
             ->setRetrievedDate($retrieved)
             ->setFormat('text/refseq-format')
             ->setFormat('application/zip')
             ->setPublisher('http://www.ncbi.nlm.nih.gov')
             ->setHomepage('http://www.ncbi.nlm.nih.gov/refseq')
             ->setRights('use')
             ->setRights('attribution')
             ->setLicense('http://www.nlm.nih.gov/copyright.html')
             ->setDataset(parent::getDatasetURI());

         // Each output file gets its own URI (release dir + file name) so the
         // per-file descriptions do not collapse onto a single resource.
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/" . basename($ofile))
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$f}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/refseq/refseq.php")
             ->setCreateDate(gmdate($date_format))
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         $dataset_description .= $output_file->toRDF() . $source_file->toRDF();
     }

     parent::writeToReleaseFile($dataset_description);
     // NOTE(review): the write file was already closed inside the loop; this
     // second close looks like it was meant to close the release file - confirm.
     parent::getWriteFile()->close();
 }
Ejemplo n.º 4
0
 /**
  * Download (when needed) each requested Orphanet file and write the
  * source/output dataset descriptions to the release file.
  *
  * The 'files' parameter is either "all" or a comma-separated list of keys
  * into $this->filemap. Keys without a filemap entry are reported and skipped.
  *
  * NOTE(review): the conversion step itself is commented out below, so this
  * method currently only downloads and describes the files - confirm whether
  * that is intentional.
  */
 function run()
 {
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $download_url = parent::getParameterValue('download_url');
     $suffix = parent::getParameterValue('output_format');
     $gz = strpos($suffix, ".gz") !== false;
     $dd = '';

     $files = parent::getParameterValue('files');
     if ($files == 'all') {
         $files = explode('|', parent::getParameterList('files'));
         // NOTE(review): presumably drops the leading "all" entry of the list - confirm.
         array_shift($files);
     } else {
         $files = explode(',', parent::getParameterValue('files'));
     }

     foreach ($files as $file) {
         echo "processing {$file} ...";
         // Guard the lookup: an unknown key would otherwise resolve to the
         // bare input directory / download URL.
         if (!isset($this->filemap[$file])) {
             echo "unknown file {$file} ... skipping" . PHP_EOL;
             continue;
         }
         $lfile = $ldir . $this->filemap[$file];
         $rfile = $download_url . $this->filemap[$file];
         if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
             $ret = utils::downloadSingle($rfile, $lfile);
             if ($ret === false) {
                 echo "unable to download {$file} ... skipping" . PHP_EOL;
                 continue;
             }
         }
         parent::setReadFile($lfile, true);
         $ofile = "orphanet-" . $file . '.' . $suffix;
         /*			parent::setWriteFile($odir.$ofile, $gz);
         			$this->$file($lfile);
         			parent::getWriteFile()->close();
         */
         parent::getReadFile()->close();
         parent::clear();
         echo "done!" . PHP_EOL;

         // dataset description
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("Orphanet: {$file}")
             ->setRetrievedDate(parent::getDate(filemtime($lfile)))
             ->setFormat("application/xml")
             ->setPublisher("http://www.orpha.net")
             ->setHomepage("http://www.orpha.net/")
             ->setRights("use")
             ->setRights("sharing-modified-version-needs-permission")
             ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
             ->setDataset("http://identifiers.org/orphanet/");
         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');

         // The output file may not exist (the write step above is disabled);
         // use the current time instead of passing filemtime()'s false along.
         $opath = $odir . $ofile;
         $created = file_exists($opath) ? filemtime($opath) : false;
         $date = parent::getDate($created !== false ? $created : time());

         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strpos($suffix, "nt") !== false) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         $dd .= $source_file->toRDF() . $output_file->toRDF();
     }

     parent::writeToReleaseFile($dd);
 }