Beispiel #1
0
 /**
  * Fetch the requested Orphanet files and write their dataset descriptions.
  *
  * For every requested file the source is downloaded when it is missing
  * locally (or when the 'download' parameter is 'true'), opened and closed
  * as the read file, and described by a source/output DataResource pair.
  * The actual RDF conversion call is currently commented out, so no output
  * file is written by this method.
  */
 function run()
 {
     $indir = parent::getParameterValue('indir');
     $outdir = parent::getParameterValue('outdir');
     $description = '';

     // 'all' expands to the declared parameter list; its first entry is dropped.
     $requested = parent::getParameterValue('files');
     if ($requested == 'all') {
         $names = explode('|', parent::getParameterList('files'));
         array_shift($names);
     } else {
         $names = explode(',', parent::getParameterValue('files'));
     }

     foreach ($names as $name) {
         echo "processing {$name} ...";
         $local = $indir . $this->filemap[$name];
         $remote = parent::getParameterValue('download_url') . $this->filemap[$name];

         $must_fetch = !file_exists($local) || parent::getParameterValue('download') == 'true';
         if ($must_fetch && utils::downloadSingle($remote, $local) === false) {
             echo "unable to download {$name} ... skipping" . PHP_EOL;
             continue;
         }

         parent::setReadFile($local, true);
         $extension = parent::getParameterValue('output_format');
         $outname = "orphanet-" . $name . '.' . $extension;
         // Only consumed by the disabled conversion step below.
         $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;

         // Conversion is disabled; the original calls were:
         //   parent::setWriteFile($outdir . $outname, $gz);
         //   $this->$name($local);
         //   parent::getWriteFile()->close();

         parent::getReadFile()->close();
         parent::clear();
         echo "done!" . PHP_EOL;

         // Description of the downloaded source file.
         $source_file = new DataResource($this);
         $source_file
             ->setURI($remote)
             ->setTitle("Orphanet: {$name}")
             ->setRetrievedDate(parent::getDate(filemtime($local)))
             ->setFormat("application/xml")
             ->setPublisher("http://www.orpha.net")
             ->setHomepage("http://www.orpha.net/")
             ->setRights("use")
             ->setRights("sharing-modified-version-needs-permission")
             ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
             ->setDataset("http://identifiers.org/orphanet/");

         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = parent::getDate(filemtime($outdir . $outname));

         // Description of the Bio2RDF output file.
         $output_file = new DataResource($this);
         $output_file
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outname}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());

         $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         $is_ntriples = strstr(parent::getParameterValue('output_format'), "nt");
         $output_file->setFormat($is_ntriples ? "application/n-triples" : "application/n-quads");

         $description .= $source_file->toRDF() . $output_file->toRDF();
     }

     parent::writeToReleaseFile($description);
 }
Beispiel #2
0
 /**
  * Convert each requested MGI report to RDF and write the dataset description.
  *
  * Each report "<item>.rpt" is downloaded when it is missing locally (or when
  * the 'download' parameter is 'true'), converted by the method of the same
  * name as the item, and described by a source/output DataResource pair.
  * The accumulated description is written to the Bio2RDF release file in the
  * output directory.
  */
 function Run()
 {
     $idir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $files = parent::getParameterValue('files');
     if ($files == 'all') {
         // The first entry of the declared list is dropped.
         $list = explode('|', parent::getParameterList('files'));
         array_shift($list);
     } else {
         $list = explode(',', $files);
     }

     $output_format = parent::getParameterValue('output_format');
     $gz = strstr($output_format, "gz") ? true : false;

     $dataset_description = '';
     foreach ($list as $item) {
         $lfile = $idir . $item . '.rpt';
         $rfile = parent::getParameterValue('download_url') . $item . '.rpt';
         if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
             echo "downloading {$item}...";
             $ret = Utils::DownloadSingle($rfile, $lfile);
             if ($ret != true) {
                 // Report the failure instead of silently moving on.
                 echo "unable to download {$item} ... skipping" . PHP_EOL;
                 continue;
             }
         }
         parent::setReadFile($lfile, true);
         echo "Processing {$item}...";

         // Keep the bare file name separate from the local path: the name is
         // what belongs in the public download URI, the path is only for writing.
         $ofile_name = $item . '.' . $output_format;
         $ofile = $odir . $ofile_name;
         parent::setWriteFile($ofile, $gz);
         // Each report has a parser method named after it.
         $this->{$item}();
         parent::getWriteFile()->close();
         parent::getReadFile()->close();
         echo "Done" . PHP_EOL;
         parent::clear();

         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("MGI {$item}")
             ->setRetrievedDate(date("Y-m-d\\TH:i:s", filemtime($lfile)))
             ->setFormat("text")
             ->setPublisher("http://www.informatics.jax.org")
             ->setHomepage("http://www.informatics.jax.org")
             ->setRights("use")
             ->setLicense("http://www.informatics.jax.org/mgihome/other/copyright.shtml")
             ->setDataset("http://identifiers.org/mgi/");

         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = date("Y-m-d\\TH:i:s");
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile_name}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$item} in {$prefix}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/mgi/mgi.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr($output_format, "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
     }

     // generate the dataset release file
     $this->setWriteFile($odir . parent::getBio2RDFReleaseFile());
     $this->getWriteFile()->write($dataset_description);
     $this->getWriteFile()->close();
     echo "done!" . PHP_EOL;
 }
Beispiel #3
0
 /**
  * Convert every "*.xml.gz" file in the input directory and write the
  * dataset description to the Bio2RDF release file in the output directory.
  *
  * When the 'id_list' parameter is non-empty it is stored on
  * $this->id_list as a flipped array (identifier => position).
  * The graph URI is switched to the dataset URI while running when the
  * 'dataset_graph' parameter is truthy, and restored afterwards.
  */
 function process_dir()
 {
     $this->setCheckPoint('dataset');
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $output_format = parent::getParameterValue('output_format');

     $this->id_list = null;
     $id_list = parent::getParameterValue('id_list');
     if ($id_list != '') {
         $this->id_list = array_flip(explode(",", trim($id_list)));
     }

     $graph_uri = parent::getGraphURI();
     $dataset_description = '';
     $gz = strstr($output_format, ".gz") !== false;

     // set graph URI to dataset graph
     // NOTE(review): loose comparison; a non-empty string such as 'false'
     // also compares equal to true. Confirm what getParameterValue returns.
     if (parent::getParameterValue('dataset_graph') == true) {
         parent::setGraphURI(parent::getDatasetURI());
     }

     // glob() returns false on error; treat that as "nothing to process".
     $files = glob($ldir . "*.xml.gz");
     if ($files === false) {
         $files = array();
     }
     $total = count($files);
     foreach ($files as $i => $file) {
         echo "Processing {$file} (" . ($i + 1) . "/" . $total . ") ...";
         $this->process_file($file);
         parent::clear();
         echo "done!" . PHP_EOL;
     }

     // Timestamps: 'H' gives a zero-padded hour ('G' produced e.g. "T5:03:02Z"),
     // and gmdate() makes the trailing literal "Z" (UTC) truthful.
     $timestamp_format = "Y-m-d\\TH:i:s\\Z";

     $source_file = (new DataResource($this))
         ->setURI("http://www.ncbi.nlm.nih.gov/pubmed")
         ->setTitle("NCBI PubMed")
         ->setRetrievedDate(gmdate($timestamp_format, filemtime($ldir)))
         ->setFormat("text/xml")
         ->setPublisher("http://ncbi.nlm.nih.gov/")
         ->setHomepage("http://www.ncbi.nlm.nih.gov/pubmed/")
         ->setRights("use-share-modify")
         ->setLicense("http://www.nlm.nih.gov/databases/license/license.html")
         ->setDataset("http://identifiers.org/pubmed/");

     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');
     $date = gmdate($timestamp_format);
     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
         ->setSource($source_file->getURI())
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pubmed/pubmed.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strstr($output_format, "nt")) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }
     $dataset_description .= $source_file->toRDF() . $output_file->toRDF();

     // set graph URI back to default
     parent::setGraphURI($graph_uri);

     // write the dataset description
     $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
     $this->getWriteFile()->write($dataset_description);
     $this->getWriteFile()->close();
 }
 /**
  * Parse a directory of files.
  *
  * Processes every "NCT*" file in the input directory (restricted to the
  * comma-separated 'id_list' parameter when it is non-empty), writing the
  * result to "clinicaltrials.<output_format>" in the output directory, and
  * then writes the source/output dataset descriptions to the release file.
  *
  * @return false|null false when the release file cannot be opened for
  *                    writing; otherwise nothing is returned.
  */
 function parse_dir()
 {
     $this->setCheckPoint('dataset');
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');

     // 'H' gives a zero-padded hour ('G' produced e.g. "T5:03:02Z"), and
     // gmdate() makes the trailing literal "Z" (UTC) truthful.
     $timestamp_format = "Y-m-d\\TH:i:s\\Z";
     $date = gmdate($timestamp_format);

     $outdir = parent::getParameterValue("outdir");
     $output_format = parent::getParameterValue('output_format');

     // Opening in "w" mode verifies up front that the release file is
     // writable (and truncates it); the handle is only closed at the end.
     $dataset_file = $outdir . parent::getBio2RDFReleaseFile();
     $fp = fopen($dataset_file, "w");
     if ($fp === FALSE) {
         trigger_error("Unable to open {$dataset_file}", E_USER_ERROR);
         return false;
     }

     // Flip the id list once so each membership test is a key lookup
     // rather than a linear scan per file.
     $id_list = parent::getParameterValue('id_list');
     $wanted = ($id_list == '') ? null : array_flip(explode(",", $id_list));

     $indir = parent::getParameterValue('indir');
     echo "Processing {$indir}\n";
     $outfile = "clinicaltrials." . $output_format;
     $gz = strstr($output_format, ".gz") !== false;
     parent::setWriteFile($outdir . $outfile, $gz);

     // glob() returns false on error; treat that as "nothing to process".
     $files = glob($indir . "NCT*");
     if ($files === false) {
         $files = array();
     }
     foreach ($files as $i => $file) {
         if ($i % 10000 == 0) {
             parent::clear();
         }
         $trial_id = basename($file, '.xml');
         if ($wanted === null || isset($wanted[$trial_id])) {
             if (filesize($file) != 0) {
                 echo "Processing {$trial_id}" . PHP_EOL;
                 $this->process_file($file);
             } else {
                 echo "Processing {$trial_id} -> Empty!" . PHP_EOL;
             }
         }
     }
     echo "Finished." . PHP_EOL;
     parent::getWriteFile()->close();

     // The retrieval date is the modification time of the last matched file.
     // With no matches there is no such file (the loop variable used to be
     // read while undefined), so fall back to the current time.
     $file_count = count($files);
     $retrieved = $file_count > 0 ? filemtime($files[$file_count - 1]) : time();

     // make the dataset description
     parent::setGraphURI(parent::getDatasetURI());
     $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
     $source_version = parent::getDatasetVersion();

     $source_file = (new DataResource($this))
         ->setURI($rfile)
         ->setTitle("Clinicaltrials")
         ->setRetrievedDate(gmdate($timestamp_format, $retrieved))
         ->setFormat("application/xml")
         ->setPublisher("http://clinicaltrials.gov/")
         ->setHomepage("http://clinicaltrials.gov/")
         ->setRights("use")
         ->setRights("by-attribution")
         ->setLicense("http://clinicaltrials.gov/ct2/about-site/terms-conditions")
         ->setDataset("http://identifiers.org/clinicaltrials/");
     parent::writeToReleaseFile($source_file->toRDF());

     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
         ->setSource($source_file->getURI())
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strstr($output_format, "nt")) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }
     parent::writeToReleaseFile($output_file->toRDF());
     parent::closeReleaseFile();

     fclose($fp);
 }
Beispiel #5
0
 /**
  * Convert the requested CTD files to RDF and write the dataset description.
  *
  * Each file is read from "<indir><file>.gz" (downloaded first when absent),
  * converted by the method "CTD_<file>", written to
  * "<outdir>ctd_<file>.<output_format>", and described by a source/output
  * DataResource pair. The accumulated description is written to the
  * Bio2RDF release file. Files that cannot be downloaded are skipped.
  */
 function process()
 {
     // get the file list
     $requested = parent::getParameterValue('files');
     if ($requested == 'all') {
         // The first entry of the declared list is dropped.
         $files = explode("|", parent::getParameterList('files'));
         array_shift($files);
     } else {
         $files = explode(",", $requested);
     }
     $dataset_description = '';

     //set directory values
     $ldir = parent::getParameterValue('indir');
     $rdir = parent::getParameterValue('download_url');
     $odir = parent::getParameterValue('outdir');
     $out_suffix = parent::getParameterValue('output_format');
     $gz = strstr($out_suffix, "gz") ? true : false;

     $graph_uri = parent::getGraphURI();
     // NOTE(review): loose comparison; a non-empty string such as 'false'
     // also compares equal to true. Confirm what getParameterValue returns.
     if (parent::getParameterValue('dataset_graph') == true) {
         parent::setGraphURI(parent::getDatasetURI());
     }

     $gz_suffix = ".gz";
     // Remote suffixes that differ from the default ".tsv.gz".
     $remote_suffixes = array(
         'chem_gene_ixn_types' => '.tsv',
         'exposure_ontology' => '.obo',
     );
     // Source dataset URIs; files not listed here get a null dataset.
     $datasets = array(
         'chemicals' => "http://identifiers.org/ctd.chemical/",
         'diseases' => "http://identifiers.org/ctd.disease/",
         'genes' => "http://identifiers.org/ctd.gene/",
     );
     // 'H' gives a zero-padded hour ('G' produced e.g. "T5:03:02Z"), and
     // gmdate() makes the trailing literal "Z" (UTC) truthful.
     $timestamp_format = "Y-m-d\\TH:i:s\\Z";

     foreach ($files as $file) {
         $suffix = isset($remote_suffixes[$file]) ? $remote_suffixes[$file] : ".tsv.gz";
         $lfile = $ldir . $file . $gz_suffix;
         $rfile = $rdir . 'CTD_' . $file . $suffix;
         if (!file_exists($lfile)) {
             trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
             if ($suffix == ".tsv.gz") {
                 Utils::DownloadSingle($rfile, $lfile);
             } else {
                 // Uncompressed remote files are gzipped on the way to disk.
                 Utils::DownloadSingle($rfile, "compress.zlib://" . $lfile);
             }
             // Do not go on to read (and stat) a file that never arrived.
             if (!file_exists($lfile)) {
                 trigger_error("Unable to download " . $rfile . " ... skipping", E_USER_WARNING);
                 continue;
             }
         }

         $ofile = "ctd_" . $file . "." . $out_suffix;
         echo "Processing " . $file . " ...";
         parent::setWriteFile($odir . $ofile, $gz);
         //set read file
         parent::setReadFile($lfile, TRUE);
         // Each file has a parser method named "CTD_<file>".
         $fnx = "CTD_" . $file;
         $this->{$fnx}();
         //close write file
         parent::getWriteFile()->close();
         parent::clear();
         echo "done!" . PHP_EOL;

         // generate the dataset release file
         echo "Generating dataset description... ";
         $dataset = isset($datasets[$file]) ? $datasets[$file] : null;

         // The title used to read "({$file}.{$gz_suffix}": a doubled dot and
         // an unbalanced parenthesis.
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("Comparative Toxicogenomics Database ({$file}{$gz_suffix})")
             ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
             ->setFormat("text/tab-separated-value")
             ->setFormat("application/gzip")
             ->setPublisher("http://ctdbase.org/")
             ->setHomepage("http://ctdbase.org/")
             ->setRights("use")
             ->setRights("by-attribution")
             ->setRights("no-commercial")
             ->setLicense("http://ctdbase.org/about/legal.jsp")
             ->setDataset($dataset);

         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = gmdate($timestamp_format);
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/ctd/ctd.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr($out_suffix, "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
     }

     parent::setGraphURI($graph_uri);
     parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
     parent::getWriteFile()->write($dataset_description);
     parent::getWriteFile()->close();
     echo "done!" . PHP_EOL;
 }
Beispiel #6
0
 /**
  * Convert the tab-separated read file into cross-reference triples.
  *
  * Every line is split on tabs; column 0 is the UniProt accession that
  * identifies the entry, and the remaining mapped columns hold delimited
  * lists of identifiers, each emitted as one statement about the entry.
  * Output is rolled over to a new numbered file
  * "iproclass.<n>.<output_format>" on the first line and every 1,000,000
  * lines thereafter.
  */
 private function process()
 {
     // Cross-reference columns keyed by column index, listed in the order
     // their statements are emitted. Column 14 (pir-psd) is skipped because
     // that database is no longer maintained.
     //   sep  - delimiter between identifiers in the column
     //   rel  - vocabulary predicate suffix (with 'ns' => triplify statement)
     //   ns   - namespace prefix for the object identifier
     //   url  - base URL for an rdfs:seeAlso link
     //   skip - number of leading characters removed from each identifier
     //   trim - whether the raw column value is trimmed first
     $xref_columns = array(
         1 => array('sep' => "; ", 'rel' => "x-uniprot", 'ns' => "uniprot:"),
         2 => array('sep' => "; ", 'rel' => "x-ncbigene", 'ns' => "geneid:"),
         3 => array('sep' => "; ", 'rel' => "x-refseq", 'ns' => "refseq:"),
         4 => array('sep' => "; ", 'rel' => "x-gi", 'ns' => "gi:"),
         5 => array('sep' => "; ", 'rel' => "x-pdb", 'ns' => "pdb:"),
         6 => array('sep' => "; ", 'rel' => "x-pfam", 'ns' => "pfam:"),
         7 => array('sep' => "; ", 'rel' => "x-go", 'ns' => "go:", 'skip' => 3),
         8 => array('sep' => "; ", 'rel' => "x-pirsf", 'ns' => "pirsf:"),
         9 => array('sep' => "; ", 'rel' => "x-ipi", 'ns' => "ipi:"),
         10 => array('sep' => "; ", 'url' => "http://uniprot.org/uniref/"),
         11 => array('sep' => "; ", 'url' => "http://uniprot.org/uniref/"),
         12 => array('sep' => "; ", 'url' => "http://uniprot.org/uniref/"),
         13 => array('sep' => "; ", 'rel' => "x-uniparc", 'ns' => "uniparc:", 'url' => "http://uniprot.org/uniparc/"),
         15 => array('sep' => "; ", 'rel' => "x-taxon", 'ns' => "taxon:"),
         16 => array('sep' => "; ", 'rel' => "x-omim", 'ns' => "omim:"),
         17 => array('sep' => "; ", 'rel' => "x-unigene", 'ns' => "unigene:"),
         18 => array('sep' => "; ", 'rel' => "x-ensembl", 'ns' => "ensembl:"),
         19 => array('sep' => "; ", 'rel' => "x-pubmed", 'ns' => "pubmed:"),
         20 => array('sep' => "; ", 'rel' => "x-genbank", 'ns' => "genbank:"),
         21 => array('sep' => ";", 'rel' => "x-genbank", 'ns' => "genbank:", 'trim' => true),
     );

     $z = 0;
     $y = 1;
     while ($l = $this->getReadFile()->Read(200000)) {
         if ($z++ % 1000000 == 0) {
             echo $z . PHP_EOL;
             $odir = parent::getParameterValue('outdir');
             $ofile = 'iproclass.' . $y++ . "." . parent::getParameterValue('output_format');
             $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
             if (parent::getWriteFile() != null) {
                 parent::getWriteFile()->close();
                 parent::clear();
             }
             // generate a new file
             parent::setWriteFile($odir . $ofile, $gz);
         }

         $fields = explode("\t", $l);
         $uniprot_acc = $fields[0];
         $id_res = $this->getNamespace() . $uniprot_acc;
         parent::addRDF(parent::triplify($id_res, $this->getVoc() . "x-uniprot", "uniprot:" . $uniprot_acc));

         foreach ($xref_columns as $index => $spec) {
             // Short lines simply lack the trailing columns; treat them as
             // empty instead of suppressing the notice with "@".
             $value = isset($fields[$index]) ? $fields[$index] : '';
             if (isset($spec['trim'])) {
                 $value = trim($value);
             }
             if (empty($value)) {
                 continue;
             }
             foreach (explode($spec['sep'], $value) as $xref) {
                 if (isset($spec['skip'])) {
                     $xref = substr($xref, $spec['skip']);
                 }
                 $rdf = '';
                 if (isset($spec['rel'])) {
                     $rdf .= parent::triplify($id_res, $this->getVoc() . $spec['rel'], $spec['ns'] . $xref);
                 }
                 if (isset($spec['url'])) {
                     $rdf .= parent::QQuadO_URL($id_res, "rdfs:seeAlso", $spec['url'] . $xref);
                 }
                 parent::AddRDF($rdf);
             }
         }

         //write rdf to file
         $this->WriteRDFBufferToWriteFile();
     }
 }
Beispiel #7
0
 /**
  * Download the RefSeq protein release files, convert each to RDF and
  * write the dataset description to the release file.
  *
  * The list of remote files is fetched from the NCBI FTP site and cached in
  * "<indir>ftp_list.txt"; the cached list is reused unless the 'download'
  * parameter is truthy. Every "gz" file then found in the input directory
  * is converted via process() and described by a DataResource pair.
  */
 function Run()
 {
     $dataset_description = '';
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $list_file = $ldir . "ftp_list.txt";

     // NOTE(review): 'download' is compared loosely to true here, so any
     // non-empty string (including 'false') matches. Confirm the value type.
     if (!file_exists($list_file) || $this->getParameterValue('download') == true) {
         echo "Getting FTP file list ...";
         $list = $this->getFtpFileList('ftp.ncbi.nlm.nih.gov', '/refseq/release/complete/', '/(complete\\.[0-9]+\\.protein\\.gpff\\.gz)/');
         // empty() also covers a null/false result, on which count() would fail.
         if (empty($list)) {
             trigger_error("Unable to get list of files from FTP site. Check internet connection", E_USER_ERROR);
             exit(-1);
         }
         asort($list);
         $buf = implode("\n", $list);
         file_put_contents($list_file, $buf);
         echo "Done." . PHP_EOL;
     } else {
         echo "Using existing ftp list" . PHP_EOL;
         // Drop blank lines (e.g. a trailing newline) so no empty file name
         // is turned into a download request.
         $list = array_filter(array_map('trim', explode("\n", file_get_contents($list_file))), 'strlen');
     }

     $counter = 1;
     $total = count($list);
     foreach ($list as $f) {
         $lfile = $ldir . $f;
         echo "Processing " . $counter++ . "/{$total} {$f}. ";
         if (!file_exists($lfile) || $this->getParameterValue('download') == true) {
             $rfile = parent::getParameterValue('download_url') . $f;
             echo "Downloading ...";
             utils::DownloadSingle($rfile, $lfile);
             echo "done.";
         } else {
             echo "Using existing file.";
         }
         echo PHP_EOL;
     }

     // 'H' gives a zero-padded hour ('G' produced e.g. "T5:03:02Z"), and
     // gmdate() makes the trailing literal "Z" (UTC) truthful.
     $timestamp_format = "Y-m-d\\TH:i:s\\Z";
     $output_format = parent::getParameterValue('output_format');
     $gz = strstr($output_format, "gz") ? true : false;

     //iterate over the files
     $files = $this->getFilePaths($ldir, 'gz');
     asort($files);
     foreach ($files as $f) {
         $lfile = $ldir . $f;
         $ofile = $odir . basename($f, ".gz") . "." . $output_format;
         parent::setWriteFile($ofile, $gz);
         parent::setReadFile($lfile, true);
         echo "processing {$f} ...";
         $this->process();
         parent::clear();
         echo "done!" . PHP_EOL;
         $this->getReadFile()->close();
         $this->getWriteFile()->close();

         // The source URI is the remote location of the file: download URL
         // plus file name, matching the URL used to download it above (the
         // local path used to be appended here by mistake).
         $source_file = (new DataResource($this))
             ->setURI(parent::getParameterValue('download_url') . $f)
             ->setTitle("NCBI RefSeq - {$f}")
             ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
             ->setFormat('text/refseq-format')
             ->setFormat('application/zip')
             ->setPublisher('http://www.ncbi.nlm.nih.gov')
             ->setHomepage('http://www.ncbi.nlm.nih.gov/refseq')
             ->setRights('use')
             ->setRights('attribution')
             ->setLicense('http://www.nlm.nih.gov/copyright.html')
             ->setDataset(parent::getDatasetURI());

         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = gmdate($timestamp_format);
         // License host fixed: it was "http://creativecommons/licenses/by/3.0/".
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$f}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/refseq/refseq.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         $dataset_description .= $output_file->toRDF() . $source_file->toRDF();
     }

     parent::writeToReleaseFile($dataset_description);
     // NOTE(review): the per-file write handle was already closed inside the
     // loop; confirm this final close targets the release file as intended.
     parent::getWriteFile()->close();
 }
Beispiel #8
0
 /**
  * Convert the tab-separated gene info read file into RDF.
  *
  * The first line is discarded as a header. Only lines with exactly 15
  * columns are converted; when $this->taxids is set, lines whose taxon id
  * (column 0) is not a key of it are skipped. Entries whose symbol is
  * "NEWENTRY" produce no statements. A column value of "-" means "no value"
  * and is not emitted. The RDF buffer is flushed to the write file after
  * every converted line.
  */
 private function geneinfo()
 {
     $i = 1;
     // Read and discard the header line.
     $this->GetReadFile()->Read(200000);
     while ($aLine = $this->GetReadFile()->Read(200000)) {
         if ($i++ % 1000 == 0) {
             parent::clear();
         }
         $splitLine = explode("\t", $aLine);
         if (count($splitLine) == 15) {
             $taxid = "taxon:" . trim($splitLine[0]);
             if (isset($this->taxids) and !isset($this->taxids[trim($splitLine[0])])) {
                 continue;
             }
             $geneid = "ncbigene:" . trim($splitLine[1]);
             $symbol = addslashes(stripslashes(trim($splitLine[2])));
             $symbolid = "symbol:{$symbol}";
             $locusTag = trim($splitLine[3]);
             $symbols_arr = explode("|", $splitLine[4]);
             $dbxrefs_arr = explode("|", $splitLine[5]);
             $chromosome = trim($splitLine[6]);
             $map_location = trim($splitLine[7]);
             $description = addslashes(stripslashes(trim($splitLine[8])));
             $type_of_gene = trim($splitLine[9]);
             $symbol_authority = addslashes(stripslashes(trim($splitLine[10])));
             $symbol_auth_full_name = addslashes(stripslashes(trim($splitLine[11])));
             $nomenclature_status = addslashes(stripslashes(trim($splitLine[12])));
             $other_designations = addslashes(stripslashes(trim($splitLine[13])));
             // Keep the raw text: the "-" placeholder must be tested on the
             // string, not on the array that date_parse() returns.
             $mod_date_raw = trim($splitLine[14]);

             //check for a valid symbol
             if ($symbol != "NEWENTRY") {
                 $this->AddRDF(
                     parent::describeIndividual($geneid, "{$description} ({$symbolid}, {$taxid})", $this->getVoc() . "Gene")
                     . parent::triplify($geneid, $this->getVoc() . "x-taxonomy", $taxid)
                     . parent::triplifyString($geneid, $this->getVoc() . "symbol", $symbol)
                     . parent::triplifyString($geneid, $this->getVoc() . "locus", addslashes(stripslashes($locusTag)))
                     . parent::describeClass($this->getVoc() . "Gene", "NCBI Gene gene")
                 );
                 if ($type_of_gene != '-') {
                     $gene_class = $this->getVoc() . ucfirst($type_of_gene) . "-Gene";
                     $this->AddRDF(
                         parent::triplify($geneid, "rdf:type", $gene_class)
                         . parent::describeClass($gene_class, ucfirst($type_of_gene) . " Gene")
                     );
                 }
                 //symbol synonyms
                 foreach ($symbols_arr as $s) {
                     if ($s != "-") {
                         $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "symbol-synonym", addslashes(stripslashes($s))));
                     }
                 }
                 //dbxrefs
                 foreach ($dbxrefs_arr as $dbx) {
                     if ($dbx != "-") {
                         $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "dbxref", $dbx));
                     }
                 }
                 //chromosome
                 if ($chromosome != "-") {
                     $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "chromosome", $chromosome));
                 }
                 //map location
                 if ($map_location != "-") {
                     $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "map-location", $map_location));
                 }
                 //description
                 if ($description != "-") {
                     $this->AddRDF(parent::triplifyString($geneid, "dc:description", $description));
                 }
                 //nomenclature authority
                 if ($symbol_authority != "-") {
                     $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-authority", $symbol_authority));
                     if ($symbol_auth_full_name != "-") {
                         $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-authority-fullname", $symbol_auth_full_name));
                     }
                 }
                 //nomenclature status
                 if ($nomenclature_status != "-") {
                     $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-status", $nomenclature_status));
                 }
                 //other designations
                 if ($other_designations != "-") {
                     foreach (explode("|", $other_designations) as $d) {
                         $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "other-designation", $d));
                     }
                 }
                 //modification date
                 if ($mod_date_raw != "-") {
                     $mod_date = date_parse($mod_date_raw);
                     // date_parse() leaves components it could not read as
                     // false; only emit a date that was fully parsed.
                     if ($mod_date["year"] !== false && $mod_date["month"] !== false && $mod_date["day"] !== false) {
                         $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "modification-date", $mod_date["year"] . "-" . $mod_date["month"] . "-" . $mod_date["day"]));
                     }
                 }
             }
         }
         parent::writeRDFBufferToWriteFile();
     }
 }
Beispiel #9
0
 /**
  * Convert the selected PharmGKB zip archives to RDF and write the dataset description.
  *
  * For every requested file the local zip archive is fetched when needed, the
  * relevant entries are streamed to the parser method named after the file
  * (or, for "annotations", after the zip entry), and a source/output
  * description is accumulated and finally written to the release file.
  *
  * @return bool|null FALSE when a zip entry cannot be streamed; nothing otherwise.
  */
 function run()
 {
     // get the file list
     if ($this->GetParameterValue('files') == 'all') {
         $files = explode("|", $this->GetParameterList('files'));
         // drop the first entry of the list (presumably the 'all' keyword itself)
         array_shift($files);
     } else {
         $files = explode(",", $this->GetParameterValue('files'));
     }
     if ($this->getParameterValue('additional') != 'none') {
         $f = explode(",", $this->getParameterValue('additional'));
         $files = array_merge($files, $f);
     }
     $ldir = $this->GetParameterValue('indir');
     $odir = $this->GetParameterValue('outdir');
     $rdir = $this->GetParameterValue('download_url');
     // Compare against the string 'true': with "== true" the string 'false'
     // is truthy and would force a download on every run.
     $download = parent::getParameterValue('download') == 'true';
     $dataset_description = '';
     foreach ($files as $file) {
         $suffix = ".zip";
         $lfile = $ldir . $file . $suffix;
         $rfile = $rdir . $file . $suffix;
         if ($file == "offsides" and !file_exists($lfile)) {
             echo "downloading {$file} ...";
             $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip";
             utils::DownloadSingle($rfile, $lfile);
             echo "done" . PHP_EOL;
         } elseif ($file == "twosides" and !file_exists($lfile)) {
             echo "downloading {$file} ...";
             $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip";
             utils::DownloadSingle($rfile, $lfile);
             echo "done" . PHP_EOL;
         } elseif ($file == 'annotations' or $file == 'relationships') {
             // these archives are never downloaded here; they must already be present
             if (!file_exists($lfile)) {
                 echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip" . PHP_EOL;
                 continue;
             }
         } else {
             if (!file_exists($lfile) or $download) {
                 echo "Downloading {$lfile} ... ";
                 Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId=' . $file . '.zip&dlCls=common', $lfile);
                 echo "done" . PHP_EOL;
             }
         }
         // get a pointer to the file in the zip archive
         if (!file_exists($lfile)) {
             echo "no local copy of {$lfile} . skipping" . PHP_EOL;
             continue;
         }
         $zin = new ZipArchive();
         // ZipArchive::open() returns TRUE on success and an integer error code
         // on failure, so a comparison with FALSE would never detect an error.
         if ($zin->open($lfile) !== true) {
             trigger_error("Unable to open {$lfile}");
             exit;
         }
         // pick the archive entries to parse for this file
         $zipentries = array();
         if ($file == "annotations") {
             // exclude: 'clinical_ann.tsv','study_parameters.tsv'
             $zipentries = array('clinical_ann_metadata.tsv', 'var_drug_ann.tsv', 'var_pheno_ann.tsv', 'var_fa_ann.tsv');
         } elseif ($file == "pathways") {
             // every entry of the archive except the .txt ones
             for ($i = 0; $i < $zin->numFiles; $i++) {
                 $stat = $zin->statIndex($i);
                 $entry = $stat['name'];
                 $ext = pathinfo($entry, PATHINFO_EXTENSION);
                 if ($ext != "txt") {
                     $zipentries[] = $entry;
                 }
             }
         } elseif ($file == "relationships") {
             $zipentries = array("relationships.tsv");
         } elseif ($file == 'offsides') {
             $zipentries = array('3003377s-offsides.tsv');
         } elseif ($file == 'twosides') {
             $zipentries = array('3003377s-twosides.tsv');
         } else {
             $zipentries = array($file . ".tsv");
         }
         // set the write file, parse, write and close
         $suffix = parent::getParameterValue('output_format');
         $outfile = $file . '.' . $suffix;
         $gz = false;
         if (strstr(parent::getParameterValue('output_format'), "gz")) {
             $gz = true;
         }
         $this->SetWriteFile($odir . $outfile, $gz);
         foreach ($zipentries as $zipentry) {
             if (($fp = $zin->getStream($zipentry)) === FALSE) {
                 // report the entry that actually failed (not always "<file>.tsv")
                 trigger_error("Unable to get {$zipentry} in ziparchive {$lfile}");
                 return FALSE;
             }
             $this->SetReadFile($lfile);
             $this->GetReadFile()->SetFilePointer($fp);
             // $fnx is the name of the parser method invoked for this entry
             if ($file == "annotations") {
                 $fnx = substr($zipentry, 0, strpos($zipentry, ".tsv"));
                 echo "processing {$zipentry}..";
             } elseif ($file == 'pathways') {
                 $fnx = 'pathways';
                 echo "processing {$fnx} ({$zipentry})... ";
             } else {
                 $fnx = $file;
                 echo "processing {$fnx} ... ";
             }
             $this->{$fnx}();
             parent::writeRDFBufferToWriteFile();
             parent::clear();
             echo "done!" . PHP_EOL;
             // generate the dataset release file
             // "H" (two-digit hour) instead of "G" so the timestamp is well-formed
             $source_file = (new DataResource($this))
                 ->setURI($rfile)
                 ->setTitle("Pharmacogenomics Knowledge Base ({$zipentry})")
                 ->setRetrievedDate(date("Y-m-d\\TH:i:s\\Z", filemtime($lfile)))
                 ->setFormat("text/tab-separated-value")
                 ->setFormat("application/zip")
                 ->setPublisher("http://www.pharmgkb.org/")
                 ->setHomepage("http://www.pharmgkb.org/")
                 ->setRights("use")
                 ->setRights("no-commercial")
                 ->setLicense("http://www.pharmgkb.org/page/policies")
                 ->setDataset("http://identifiers.org/pharmgkb/");
             $prefix = parent::getPrefix();
             $bVersion = parent::getParameterValue('bio2rdf_release');
             $date = date("Y-m-d\\TH:i:s\\Z");
             $output_file = (new DataResource($this))
                 ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
                 ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} {$file} (generated at {$date})")
                 ->setSource($source_file->getURI())
                 ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pharmgkb/pharmgkb.php")
                 ->setCreateDate($date)
                 ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
                 ->setPublisher("http://bio2rdf.org")
                 ->setRights("use-share-modify")
                 ->setRights("by-attribution")
                 ->setRights("restricted-by-source-license")
                 ->setLicense("http://creativecommons.org/licenses/by/3.0/")
                 ->setDataset(parent::getDatasetURI());
             if ($gz) {
                 $output_file->setFormat("application/gzip");
             }
             if (strstr(parent::getParameterValue('output_format'), "nt")) {
                 $output_file->setFormat("application/n-triples");
             } else {
                 $output_file->setFormat("application/n-quads");
             }
             $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
         }
         $this->GetWriteFile()->Close();
     }
     // foreach
     echo "Generating dataset description... ";
     parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
     parent::getWriteFile()->write($dataset_description);
     parent::getWriteFile()->close();
     echo "done!" . PHP_EOL;
 }
Beispiel #10
0
 /**
  * Convert the selected Affymetrix probeset annotation archives to RDF.
  *
  * The listing page is downloaded when missing or when a download is requested,
  * the referenced .csv.zip files are matched against the 'files' parameter, and
  * each locally available archive is streamed to parse(). A dataset description
  * for the sources and the single output file is written at the end.
  *
  * @return bool TRUE on completion, FALSE when a CSV entry cannot be streamed.
  */
 function Run()
 {
     // directory shortcuts
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     // get the listings page
     $url = trim(parent::getParameterValue('download_url'));
     $listing_file = $ldir . "probeset_list.html";
     if (!file_exists($listing_file) || parent::getParameterValue("download") == "true") {
         echo "Downloading {$listing_file}" . PHP_EOL;
         Utils::DownloadSingle($url, $listing_file);
     }
     $listings = file_get_contents($listing_file);
     // make a list of the csv.zip files
     preg_match_all("/\"([^\"]+)\\.csv\\.zip\"/", $listings, $m);
     if (count($m[1]) == 0) {
         trigger_error("could not find any .csv.zip files in {$url}");
         exit;
     }
     if (parent::getParameterValue("files") == 'all') {
         $myfiles = $m[1];
     } else {
         // keep the first listed archive whose name contains each requested token
         $a = explode(",", parent::getParameterValue("files"));
         foreach ($a as $f) {
             $found = false;
             foreach ($m[1] as $n) {
                 if (strstr($n, $f)) {
                     $found = true;
                     $myfiles[] = $n;
                     break;
                 }
             }
             if ($found === false) {
                 echo "cannot find {$f} in list" . PHP_EOL;
             }
         }
     }
     if (!isset($myfiles)) {
         // nothing to do
         exit;
     }
     $dataset_description = '';
     // stays null when every archive is skipped, so the output description
     // below does not dereference an undefined variable
     $source_file = null;
     // set the write file
     $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;
     $outfile = 'affymetrix.' . parent::getParameterValue('output_format');
     $this->setWriteFile($odir . $outfile, $gz);
     // iterate over the files
     foreach ($myfiles as $rfile) {
         $base_file = substr($rfile, strrpos($rfile, "/") + 1);
         // get and set the dataset version
         if (parent::getDatasetVersion() == null) {
             preg_match("/\\.na([0-9]{2})\\.annot/", $base_file, $m);
             if (isset($m[1])) {
                 $this->setDatasetVersion($m[1]);
             }
         }
         // substitute the requested version into the file name when it differs
         if (parent::getDatasetVersion() != parent::getParameterValue('version')) {
             $base_file = str_replace("na" . parent::getDatasetVersion(), "na" . parent::getParameterValue('version'), $base_file);
         }
         $csv_file = $base_file . ".csv";
         $zip_file = $csv_file . ".zip";
         $lfile = $ldir . $zip_file;
         if (!file_exists($lfile)) {
             echo "skipping: {$lfile} does not exist" . PHP_EOL;
             continue;
         }
         echo "processing {$lfile}" . PHP_EOL;
         // open the zip file
         $zin = new ZipArchive();
         // ZipArchive::open() returns TRUE on success and an integer error code
         // on failure, so a comparison with FALSE would never detect an error.
         if ($zin->open($lfile) !== true) {
             trigger_error("Unable to open {$lfile}");
             exit;
         }
         if (($fp = $zin->getStream($csv_file)) === FALSE) {
             trigger_error("Unable to get {$csv_file} in ziparchive {$lfile}");
             return FALSE;
         }
         parent::setReadFile($lfile);
         parent::getReadFile()->setFilePointer($fp);
         $this->parse($base_file);
         parent::getReadFile()->close();
         parent::clear();
         // dataset description
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("Affymetrix Probeset: {$base_file}")
             ->setRetrievedDate(parent::getDate(filemtime($lfile)))
             ->setFormat("text/tab-separated-value")
             ->setFormat("application/zip")
             ->setPublisher("http://affymetrix.com")
             ->setHomepage("http://www.affymetrix.com/support/technical/annotationfilesmain.affx")
             ->setRights("use")
             ->setRights("no-commercial")
             ->setRights("registration-required")
             ->setLicense("http://www.affymetrix.com/about_affymetrix/legal/index.affx")
             ->setDataset("http://identifiers.org/affy.probeset/");
         $dataset_description .= $source_file->toRDF();
     }
     $this->getWriteFile()->close();
     // write the dataset description
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');
     $date = parent::getDate(filemtime($odir . $outfile));
     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}");
     // only the last processed source is referenced, as before; skipped
     // entirely when no archive was processed
     if ($source_file !== null) {
         $output_file->setSource($source_file->getURI());
     }
     $output_file
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/affymetrix/affymetrix.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strstr(parent::getParameterValue('output_format'), "nt")) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }
     $dataset_description .= $output_file->toRDF();
     // write the dataset description
     $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
     $this->getWriteFile()->write($dataset_description);
     $this->getWriteFile()->close();
     return true;
 }
Beispiel #11
0
 /**
  * Read the current read file line by line and emit RDF for every annotation row.
  *
  * Lines starting with "!" are skipped. Every other line must split into
  * exactly 17 tab-separated columns; otherwise an E_USER_ERROR is raised.
  * The RDF buffer is flushed to the write file after each processed row.
  *
  * @param string $file token used as part of each generated annotation identifier
  * @return bool|null false when a row has an unexpected number of columns
  */
 function process($file)
 {
     $rowno = 1;
     while ($line = parent::getReadFile()->read(100000)) {
         if ($rowno % 100000 == 0) {
             parent::clear();
         }
         if ($line[0] == "!") {
             continue;
         }
         $cols = explode("\t", $line);
         if (count($cols) != 17) {
             trigger_error("Expected 17 columns, but found " . count($cols), E_USER_ERROR);
             return false;
         }
         // pull out the columns that are used below
         list($db, $id, $symbol, $qualifier) = $cols;
         $goid = substr($cols[4], 3);
         $refs = $this->getDbReferences($cols[5]);
         $eco = $this->getEvidenceCodeLabelArr($cols[6]);
         $aspect = $this->getAspect($cols[8]);
         $label = $cols[9];
         $synonyms = explode("|", $cols[10]);
         $taxid = $cols[12];
         $date = $this->parseDate($cols[13]);
         $assignedBy = $cols[14];
         // identifier of the annotated entity; rows without one are dumped and skipped
         $eid = $this->getdbURI($db, $id);
         if (!$eid) {
             print_r($cols);
             continue;
         }
         // describe the entity itself
         $entity_rdf = parent::describeIndividual($eid, $label, parent::getVoc() . "GO-Annotation");
         $entity_rdf .= parent::describeClass(parent::getVoc() . "GO-Annotation", "GO Annotation");
         $entity_rdf .= parent::triplifyString($eid, parent::getVoc() . "symbol", $symbol);
         parent::addRDF($entity_rdf);
         parent::addRDF(parent::triplify($eid, parent::getVoc() . "x-taxonomy", $taxid));
         foreach ($synonyms as $synonym) {
             if (empty($synonym)) {
                 continue;
             }
             parent::addRDF(parent::triplifyString($eid, parent::getVoc() . "synonym", $synonym));
         }
         // the relation is the aspect itself, or its negated form for NOT-qualified rows
         $rel = $aspect;
         if ($qualifier == 'NOT') {
             if ($aspect == 'component') {
                 $rel = 'not-in-component';
             } elseif ($aspect == 'function') {
                 $rel = 'not-has-function';
             } elseif ($aspect == 'process') {
                 $rel = 'not-in-process';
             }
         }
         $relation_rdf = parent::describeObjectProperty(parent::getVoc() . $rel, str_replace("-", " ", $rel));
         $relation_rdf .= parent::triplify($eid, parent::getVoc() . $rel, "go:" . $goid);
         parent::addRDF($relation_rdf);
         // one annotation resource per processed row, numbered by the running counter
         $type = key($eco);
         $aid = parent::getRes() . $file . "_" . $rowno++;
         $link_rdf = parent::describeObjectProperty(parent::getVoc() . "go-annotation", "GO annotation");
         $link_rdf .= parent::triplify($eid, parent::getVoc() . "go-annotation", $aid);
         parent::addRDF($link_rdf);
         $cat = parent::getRes() . md5($aspect);
         $annotation_rdf = parent::describeIndividual($aid, "{$id}-go:{$goid} association", parent::getVoc() . "GO-Annotation");
         $annotation_rdf .= parent::triplify($aid, parent::getVoc() . "target", $eid);
         $annotation_rdf .= parent::triplify($aid, parent::getVoc() . "go-term", "go:" . $goid);
         $annotation_rdf .= parent::triplify($aid, parent::getVoc() . "evidence", "eco:" . $eco[$type][1]);
         $annotation_rdf .= parent::triplify($aid, parent::getVoc() . "go-category", $cat);
         $annotation_rdf .= parent::describeClass($cat, $aspect);
         $annotation_rdf .= parent::triplifyString($aid, parent::getVoc() . "assigned-by", $assignedBy);
         parent::addRDF($annotation_rdf);
         if ($date != '') {
             parent::addRDF(parent::triplifyString($aid, parent::getVoc() . "entry-date", $date . "T00:00:00Z", "xsd:dateTime"));
         }
         // only PMID references are linked as articles
         foreach ($refs as $ref) {
             $parts = explode(":", $ref);
             if ($parts[0] == 'PMID') {
                 parent::addRDF(parent::triplify($aid, parent::getVoc() . "article", "pubmed:" . $parts[1]));
             }
         }
         //write RDF to file
         parent::writeRDFBufferToWriteFile();
     }
 }
Beispiel #12
0
 /**
  * Convert the selected KEGG lists to RDF and write the dataset description.
  *
  * "genes" is handled separately: the genome list is read and, for each
  * organism in the hard-coded $orgs list, the organism's gene list is
  * processed into a single kegg-genes output file. Every other entry of the
  * 'files' parameter is processed into its own kegg-<db> output file.
  *
  * @return void
  */
 function run()
 {
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $dataset_description = '';
     $files = parent::getParameterValue('files');
     if ($files == 'all') {
         $files = explode('|', parent::getParameterList('files'));
         // drop the first entry of the list (presumably the 'all' keyword itself)
         array_shift($files);
     } else {
         $files = explode(',', parent::getParameterValue('files'));
     }
     if (parent::getParameterValue('id_list') != '') {
         $this->idlist = explode(",", parent::getParameterValue("id_list"));
     }
     // handle genes separately
     if (in_array("genes", $files)) {
         $orgs = array("hsa");
         //,"mmu","eco","dre","dme","ath","sce","ddi");
         echo "processing genes" . PHP_EOL;
         $ofile = "kegg-genes." . parent::getParameterValue('output_format');
         $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
         parent::setWriteFile($odir . $ofile, $gz);
         // get the list of genomes
         $lfile = $ldir . "genome.txt";
         $rfile = parent::getParameterValue("download_url") . "list/genome";
         if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
             utils::downloadSingle($rfile, $lfile);
         }
         // stays null when no organism is processed, so the output description
         // below does not dereference an undefined variable
         $source_file = null;
         $fp = fopen($lfile, "r");
         if ($fp === false) {
             trigger_error("Unable to open {$lfile}", E_USER_WARNING);
         } else {
             // compare with false explicitly so a falsy line does not end the loop early
             while (($l = fgets($fp)) !== false) {
                 $a = explode("\t", $l);
                 if (!isset($a[1])) {
                     // line without a second tab-separated column
                     continue;
                 }
                 $b = explode(", ", $a[1]);
                 $org = $b[0];
                 if (!in_array($org, $orgs)) {
                     continue;
                 }
                 // get the list of genes for this organism
                 echo "processing {$org}" . PHP_EOL;
                 $this->org = $org;
                 $org_lfile = $ldir . $org . ".txt";
                 $org_rfile = parent::getParameterValue("download_url") . "list/{$org}";
                 if (!file_exists($org_lfile) || parent::getParameterValue('download') == 'true') {
                     utils::downloadSingle($org_rfile, $org_lfile);
                 }
                 parent::setReadFile($org_lfile, false);
                 $this->process("gene");
                 parent::getReadFile()->close();
                 parent::clear();
                 $this->org = null;
                 // add dataset description
                 $source_file = (new DataResource($this))
                     ->setURI($org_rfile)
                     ->setTitle("KEGG: Gene")
                     ->setRetrievedDate(parent::getDate(filemtime($org_lfile)))
                     ->setFormat("text/plain")
                     ->setPublisher("http://www.kegg.jp/")
                     ->setHomepage("http://www.kegg.jp/")
                     ->setRights("use")
                     ->setRights("no-commercial")
                     ->setLicense("http://www.kegg.jp/kegg/legal.html")
                     ->setDataset("http://identifiers.org/kegg/");
                 $dataset_description .= $source_file->toRDF();
             }
             fclose($fp);
         }
         parent::getWriteFile()->close();
         echo "done" . PHP_EOL;
         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = parent::getDate(filemtime($odir . $ofile));
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - Gene ");
         // only the last processed organism list is referenced, as before
         if ($source_file !== null) {
             $output_file->setSource($source_file->getURI());
         }
         $output_file
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/kegg/kegg.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr(parent::getParameterValue('output_format'), "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         $dataset_description .= $output_file->toRDF();
     }
     // all other files
     foreach ($files as $db) {
         if ($db == "genes") {
             continue;
         }
         echo "processing {$db}" . PHP_EOL;
         $lfile = $ldir . $db . ".txt";
         $rfile = parent::getParameterValue("download_url") . "list/{$db}";
         if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
             echo "Downloading {$rfile} ";
             $ret = utils::downloadSingle($rfile, $lfile);
             if ($ret === false) {
                 // report the list being processed ($file is not defined in this method)
                 echo "unable to download {$db} ... skipping" . PHP_EOL;
                 continue;
             }
             echo "done." . PHP_EOL;
         }
         // now for each list, get the individual entries
         $ofile = "kegg-{$db}." . parent::getParameterValue('output_format');
         $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
         parent::setReadFile($lfile, false);
         parent::setWriteFile($odir . $ofile, $gz);
         $this->process($db);
         parent::getWriteFile()->close();
         parent::getReadFile()->close();
         parent::clear();
         echo "done!" . PHP_EOL;
         // add dataset description
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("KEGG: {$db}")
             ->setRetrievedDate(parent::getDate(filemtime($lfile)))
             ->setFormat("text/plain")
             ->setPublisher("http://www.kegg.jp/")
             ->setHomepage("http://www.kegg.jp/")
             ->setRights("use")
             ->setRights("no-commercial")
             ->setLicense("http://www.kegg.jp/kegg/legal.html")
             ->setDataset("http://identifiers.org/kegg/");
         $prefix = parent::getPrefix();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $date = parent::getDate(filemtime($odir . $ofile));
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$db} ")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/kegg/kegg.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr(parent::getParameterValue('output_format'), "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
     }
     // write the dataset description
     $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
     $this->getWriteFile()->write($dataset_description);
     $this->getWriteFile()->close();
 }
Beispiel #13
0
 /**
  * Convert the selected NCBI Taxonomy packages to RDF and write the dataset description.
  *
  * Packages come from getPackageMap(), optionally filtered by the 'files'
  * parameter. Each package archive is downloaded when requested or missing;
  * for the taxdmp and gi2taxid packages the listed archive entries are
  * streamed to the method named after the entry key.
  *
  * @return void
  */
 public function Run()
 {
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     //which files are to be converted?
     $selectedPackage = trim(parent::getParameterValue('files'));
     if ($selectedPackage == 'all') {
         $files = $this->getPackageMap();
     } else {
         $sel_arr = explode(",", $selectedPackage);
         $pm = $this->getPackageMap();
         $files = array();
         foreach ($sel_arr as $a) {
             if (array_key_exists($a, $pm)) {
                 $files[$a] = $pm[$a];
             }
         }
     }
     // Compare against the string 'true': with loose boolean comparisons the
     // string 'false' is truthy, which forced a download on every run.
     $download = parent::getParameterValue('download') == 'true';
     $dataset_description = '';
     foreach ($files as $key => $value) {
         $lfile = $ldir . $value['filename'];
         if (!file_exists($lfile) && !$download) {
             trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
             // as before, the switch stays on for the remaining packages
             $this->SetParameterValue('download', true);
             $download = true;
         }
         //download all files [except mapping file]
         if ($download) {
             $rfile = $value["file_url"];
             // var_dump() prints its argument and returns NULL, so it must not
             // be concatenated into the message
             echo "downloading " . $rfile . " ... ";
             utils::downloadSingle($rfile, $lfile);
         }
         if ($key == "taxdmp" || $key == "gi2taxid_protein" || $key == "gi2taxid_nucleotide") {
             //get the name of the zip archive
             $lfile = $value["filename"];
             // make sure we have the zip archive
             $zinfile = $ldir . $lfile;
             $zin = new ZipArchive();
             // ZipArchive::open() returns TRUE on success and an integer error
             // code on failure, so a comparison with FALSE never detects an error.
             if ($zin->open($zinfile) !== true) {
                 trigger_error("Unable to open {$zinfile}");
                 exit;
             }
             // describe the source archive and the generated output
             $source_file = (new DataResource($this))
                 ->setURI($value['file_url'])
                 ->setTitle('NCBI Taxonomy - ' . $key)
                 ->setRetrievedDate(date("Y-m-d\\TH:i:sP", filemtime($ldir . $lfile)))
                 ->setFormat('text/tab-separated-value')
                 ->setFormat('application/zip')
                 ->setPublisher('http://www.ncbi.nlm.nih.gov')
                 ->setHomepage('http://www.ncbi.nlm.nih.gov/taxonomy')
                 ->setRights('use')
                 ->setRights('attribution')
                 ->setLicense('https://www.nlm.nih.gov/copyright.html')
                 ->setDataset(parent::getDatasetURI());
             $prefix = parent::getPrefix();
             $bVersion = parent::getParameterValue('bio2rdf_release');
             $date = date("Y-m-d\\TH:i:sP");
             $output_file = (new DataResource($this))
                 ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
                 ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$key}")
                 ->setSource($source_file->getURI())
                 ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/taxonomy/taxonomy.php")
                 ->setCreateDate($date)
                 ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
                 ->setPublisher("http://bio2rdf.org")
                 ->setRights("use-share-modify")
                 ->setRights("restricted-by-source-license")
                 ->setLicense("http://creativecommons.org/licenses/by/3.0/")
                 ->setDataset(parent::getDatasetURI());
             $dataset_description .= $output_file->toRDF() . $source_file->toRDF();
             //now iterate over the files in the ziparchive
             foreach ($value["contents"] as $k => $fn) {
                 if ($k == "names" || $k == "nodes" || $k == "citations" || $k == "gencode" || $k == "division" || $k == "gi_taxid_prot" || $k == "gi_taxid_nucl") {
                     $fpin = $zin->getStream($fn);
                     if (!$fpin) {
                         trigger_error("Unable to get pointer to {$fn} in {$zinfile}");
                         exit("failed\n");
                     }
                     $gzoutfile = $odir . "taxonomy-{$k}" . "." . parent::getParameterValue('output_format');
                     //set the write file
                     $gz = strstr(parent::getParameterValue('output_format'), 'gz') ? true : false;
                     parent::setReadFile($ldir . $lfile);
                     parent::getReadFile()->SetFilePointer($fpin);
                     parent::setWriteFile($gzoutfile, $gz);
                     echo "processing {$fn}...\n";
                     // the entry key doubles as the name of the parser method
                     $this->{$k}();
                     $this->GetWriteFile()->Close();
                     echo "done!" . PHP_EOL;
                     parent::clear();
                 }
                 //if $k
             }
             //foreach
         }
         //if key taxdmp
     }
     // write the accumulated dataset description once, after all packages
     $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
     $this->getWriteFile()->write($dataset_description);
     $this->getWriteFile()->close();
 }
Beispiel #14
0
 /**
  * Convert the selected DrugBank files to RDF and append their descriptions to the release file.
  *
  * Each entry of the 'files' parameter is mapped to an archive name and to a
  * parser method called parse_<entry>. Only 'drugbank' is known here; any
  * other entry is reported and skipped.
  *
  * @return void
  */
 function Run()
 {
     $indir = parent::getParameterValue('indir');
     $outdir = parent::getParameterValue('outdir');
     $download_url = parent::getParameterValue('download_url');
     if (parent::getParameterValue('files') == 'all') {
         $files = explode("|", parent::getParameterList('files'));
         // drop the first entry of the list (presumably the 'all' keyword itself)
         array_shift($files);
     } else {
         $files = explode("|", parent::getParameterValue('files'));
     }
     if (parent::getParameterValue("id_list")) {
         // flipped so the ids become array keys
         $this->id_list = array_flip(explode(",", parent::getParameterValue('id_list')));
     }
     // Compare against the string 'true': with "== true" the string 'false'
     // is truthy and would force a download on every run.
     $download = parent::getParameterValue('download') == 'true';
     $dataset_description = '';
     foreach ($files as $f) {
         if ($f == 'drugbank') {
             $file = 'drugbank.xml.zip';
             $lname = 'drugbank';
         } else {
             // without this guard $file/$lname would be undefined, or left
             // over from a previous iteration
             echo "unknown file {$f} ... skipping" . PHP_EOL;
             continue;
         }
         $fnx = 'parse_' . $f;
         $rfile = $download_url . $file;
         $lfile = $indir . $file;
         $cfile = $lname . "." . parent::getParameterValue('output_format');
         // download
         if (!file_exists($lfile) || $download) {
             utils::downloadSingle($rfile, $lfile);
         }
         // setup the write
         $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;
         parent::setWriteFile($outdir . $cfile, $gz);
         echo $outdir . $cfile;
         if (file_exists($lfile)) {
             // call the parser
             echo "processing {$file} ..." . PHP_EOL;
             $this->{$fnx}($indir, $file);
             echo "done" . PHP_EOL;
             parent::clear();
         }
         parent::getWriteFile()->close();
         // dataset description: temporarily switch the graph URI to the
         // dataset URI and restore it afterwards
         $ouri = parent::getGraphURI();
         parent::setGraphURI(parent::getDatasetURI());
         $source_version = parent::getDatasetVersion();
         $bVersion = parent::getParameterValue('bio2rdf_release');
         $prefix = parent::getPrefix();
         $date = date("Y-m-d\\TH:i:sP");
         $source_file = (new DataResource($this))
             ->setURI($rfile)
             ->setTitle("DrugBank ({$file})")
             ->setRetrievedDate(date("Y-m-d\\TH:i:sP", filemtime($lfile)))
             ->setFormat("application/xml")
             ->setFormat("application/zip")
             ->setPublisher("http://drugbank.ca")
             ->setHomepage("http://drugbank.ca")
             ->setRights("use")
             ->setRights("by-attribution")
             ->setRights("no-commercial")
             ->setLicense("http://www.drugbank.ca/about")
             ->setDataset("http://identifiers.org/drugbank/");
         $output_file = (new DataResource($this))
             ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$cfile}")
             ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
             ->setSource($source_file->getURI())
             ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/drugbank/drugbank.php")
             ->setCreateDate($date)
             ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
             ->setPublisher("http://bio2rdf.org")
             ->setRights("use-share-modify")
             ->setRights("by-attribution")
             ->setRights("restricted-by-source-license")
             ->setLicense("http://creativecommons.org/licenses/by/3.0/")
             ->setDataset(parent::getDatasetURI());
         if ($gz) {
             $output_file->setFormat("application/gzip");
         }
         if (strstr(parent::getParameterValue('output_format'), "nt")) {
             $output_file->setFormat("application/n-triples");
         } else {
             $output_file->setFormat("application/n-quads");
         }
         parent::writeToReleaseFile($source_file->toRDF() . $output_file->toRDF());
         parent::setGraphURI($ouri);
     }
     parent::closeReleaseFile();
 }
Beispiel #15
0
 /**
  * Convert a set of dbSNP records to RDF and write the dataset description.
  *
  * The 'files' parameter selects the SNP list: 'all', 'clinical', 'omim' or
  * 'pharmgkb' delegate to the matching helper; any other value is used as a
  * comma-separated list of SNP identifiers. Each SNP is read from
  * <indir>/<snp>.xml.gz (downloaded first when missing or when the 'download'
  * parameter is set), parsed, and appended to <outdir>/dbsnp.<output_format>.
  * Finally the source/output dataset descriptions are written to the
  * Bio2RDF release file in the output directory.
  */
 function run()
 {
     $ldir = parent::getParameterValue('indir');
     $odir = parent::getParameterValue('outdir');
     $format = parent::getParameterValue('output_format');
     $downloadUrl = parent::getParameterValue('download_url');
     // NOTE(review): loose comparison kept from the original code; if the
     // parameter is stored as the string 'false' this still evaluates to true.
     $forceDownload = parent::getParameterValue('download') == true;

     // resolve the list of SNP identifiers to process
     $snps = explode(",", parent::getParameterValue('files'));
     switch ($snps[0]) {
         case 'all':
             $snps = $this->getSNPs();
             break;
         case 'clinical':
             $snps = $this->getSNPs(true);
             break;
         case 'omim':
             $lfile = $ldir . 'snp_omimvar.txt';
             if (!file_exists($lfile) || $forceDownload) {
                 utils::DownloadSingle('ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/snp_omimvar.txt', $lfile);
             }
             $snps = $this->processOMIMVar($lfile);
             break;
         case 'pharmgkb':
             $lfile = $ldir . 'pharmgkb.snp.zip';
             if (!file_exists($lfile) || $forceDownload) {
                 utils::DownloadSingle('http://www.pharmgkb.org/download.do?objId=rsid.zip&dlCls=common', $lfile);
             }
             $snps = $this->processPharmGKBSnps($lfile);
             break;
     }

     // setup the write; keep the bare file name separate from the local path
     // so that the published URI below does not embed the output directory
     $ofile = "dbsnp." . $format;
     $outfile = $odir . $ofile;
     $gz = strstr($format, ".gz") !== false;
     parent::setWriteFile($outfile, $gz);

     $n = count($snps);
     $z = 0;
     // default so the dataset description is well-defined for an empty list;
     // otherwise the URI of the last SNP visited is used, as before
     $rfile = $downloadUrl;
     foreach ($snps as $i => $snp) {
         $file = $snp . '.xml.gz';
         $infile = $ldir . $file;
         $rfile = $downloadUrl . $snp;

         // download only this file when it is missing; a single missing file
         // must not switch on re-downloading for every following SNP
         $download = $forceDownload || !file_exists($infile);
         if ($download) {
             trigger_error("Downloading {$file}", E_USER_NOTICE);
             $ret = utils::downloadSingle($rfile, "compress.zlib://" . $infile, true);
             if ($ret === false) {
                 continue;
             }
         }

         // process
         echo "Processing {$snp} (" . ($i + 1) . "/{$n})" . PHP_EOL;
         $this->parse($infile);
         parent::writeRDFBufferToWriteFile();
         // clear on the first and then every 10000th processed SNP
         if ($z++ % 10000 == 0) {
             parent::clear();
         }
     }
     parent::getWriteFile()->close();

     // generate the dataset description file
     $prefix = parent::getPrefix();
     $bVersion = parent::getParameterValue('bio2rdf_release');
     // zero-padded hour plus the real UTC offset ('G' is unpadded and a
     // literal 'Z' would label local time as UTC)
     $date = date("Y-m-d\\TH:i:sP");

     $source_file = (new DataResource($this))
         ->setURI($rfile)
         ->setTitle("dbSNP " . parent::getDatasetVersion())
         ->setRetrievedDate($date)
         ->setFormat("application/xml")
         ->setPublisher("http://www.ncbi.nlm.nih.gov")
         ->setHomepage("http://www.ncbi.nlm.nih.gov/SNP/")
         ->setRights("use-share-modify")
         ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
         ->setDataset("http://identifiers.org/dbsnp/");

     $output_file = (new DataResource($this))
         ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
         ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
         ->setSource($source_file->getURI())
         ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dbsnp/dbsnp.php")
         ->setCreateDate($date)
         ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
         ->setPublisher("http://bio2rdf.org")
         ->setRights("use-share-modify")
         ->setRights("by-attribution")
         ->setRights("restricted-by-source-license")
         ->setLicense("http://creativecommons.org/licenses/by/3.0/")
         ->setDataset(parent::getDatasetURI());
     if ($gz) {
         $output_file->setFormat("application/gzip");
     }
     if (strstr($format, "nt")) {
         $output_file->setFormat("application/n-triples");
     } else {
         $output_file->setFormat("application/n-quads");
     }

     $dataset_description = $source_file->toRDF() . $output_file->toRDF();
     parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
     parent::getWriteFile()->write($dataset_description);
     parent::getWriteFile()->close();
 }
Beispiel #16
0
 /**
  * Parse the current read file as RDF/XML and triplify each resulting triple.
  *
  * The file is read through the compress.zlib:// wrapper, parsed with the
  * ARC2 RDF/XML parser, and every triple is handed to TriplifyMap() and then
  * flushed to the write file.
  *
  * @param string $abbv abbreviation passed (lower-cased) to TriplifyMap()
  */
 private function OWL2RDF($abbv)
 {
     $filename = parent::getReadFile()->getFilename();
     $buf = file_get_contents("compress.zlib://" . $filename);
     if ($buf === false) {
         // do not hand a failed read to the parser
         // NOTE(review): ARC2 may try to fetch the base URI when given empty data - confirm
         trigger_error("Unable to read " . $filename, E_USER_WARNING);
         parent::clear();
         return;
     }

     $parser = ARC2::getRDFXMLParser('file://' . $filename);
     $parser->parse("http://bio2rdf.org/bioportal#", $buf);

     // loop-invariant: lower-case the abbreviation once
     $abbreviation = strtolower($abbv);
     foreach ($parser->getTriples() as $triple) {
         $this->TriplifyMap($triple, $abbreviation);
         parent::writeRDFBufferToWriteFile();
     }
     parent::clear();
 }