/**
 * Describe every requested Orphanet file in the Bio2RDF release file.
 *
 * Each selected file is downloaded when no local copy exists (or when the
 * 'download' parameter equals 'true'), opened and closed as the read file,
 * and a source/output DataResource pair is appended to the description that
 * is finally handed to writeToReleaseFile().
 *
 * NOTE(review): the conversion step is commented out inside the loop, yet the
 * modification time of the output file is still read afterwards -- confirm
 * that the output files are expected to exist already.
 */
function run() {
    $inputDir = parent::getParameterValue('indir');
    $outputDir = parent::getParameterValue('outdir');
    $description = '';

    // 'all' expands to the declared parameter list minus its first entry
    $selected = parent::getParameterValue('files');
    if ($selected == 'all') {
        $selected = explode('|', parent::getParameterList('files'));
        array_shift($selected);
    } else {
        $selected = explode(',', parent::getParameterValue('files'));
    }

    foreach ($selected as $file) {
        echo "processing {$file} ...";
        $localPath = $inputDir . $this->filemap[$file];
        $remoteUrl = parent::getParameterValue('download_url') . $this->filemap[$file];

        $mustDownload = !file_exists($localPath) || parent::getParameterValue('download') == 'true';
        if ($mustDownload) {
            if (utils::downloadSingle($remoteUrl, $localPath) === false) {
                echo "unable to download {$file} ... skipping" . PHP_EOL;
                continue;
            }
        }

        parent::setReadFile($localPath, true);

        $outputName = "orphanet-" . $file . '.' . parent::getParameterValue('output_format');
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
        /*
        parent::setWriteFile($outputDir.$outputName, $gz);
        $this->$file($localPath);
        parent::getWriteFile()->close();
        */
        parent::getReadFile()->close();
        parent::clear();
        echo "done!" . PHP_EOL;

        // description of the downloaded source file
        $source_file = (new DataResource($this))
            ->setURI($remoteUrl)
            ->setTitle("Orphanet: {$file}")
            ->setRetrievedDate(parent::getDate(filemtime($localPath)))
            ->setFormat("application/xml")
            ->setPublisher("http://www.orpha.net")
            ->setHomepage("http://www.orpha.net/")
            ->setRights("use")
            ->setRights("sharing-modified-version-needs-permission")
            ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
            ->setDataset("http://identifiers.org/orphanet/");

        // description of the generated Bio2RDF file
        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = parent::getDate(filemtime($outputDir . $outputName));
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outputName}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        $gz = !(strstr(parent::getParameterValue('output_format'), ".gz") === FALSE);
        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $description .= $source_file->toRDF() . $output_file->toRDF();
    } // foreach

    parent::writeToReleaseFile($description);
}
/**
 * Convert the requested MGI report files and write the dataset description.
 *
 * For every selected report the '.rpt' file is downloaded when it is missing
 * locally (or when the 'download' parameter equals 'true'), converted by the
 * method named after the report, and described by a source/output
 * DataResource pair. The collected description is written to the Bio2RDF
 * release file in the output directory.
 *
 * Fix: the public download URI of the output file is now built from the
 * output file name only; previously the local output directory was embedded
 * in the URI because the full local path was interpolated.
 */
function Run() {
    $idir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // 'all' expands to the declared parameter list minus its first entry
    $files = parent::getParameterValue('files');
    if ($files == 'all') {
        $list = explode('|', parent::getParameterList('files'));
        array_shift($list);
    } else {
        $list = explode(',', parent::getParameterValue('files'));
    }

    $dataset_description = '';
    foreach ($list as $item) {
        $lfile = $idir . $item . '.rpt';
        $rfile = parent::getParameterValue('download_url') . $item . '.rpt';
        if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
            echo "downloading {$item}...";
            $ret = Utils::DownloadSingle($rfile, $lfile);
            if ($ret != true) {
                // nothing to convert when the download did not succeed
                continue;
            }
        }
        parent::setReadFile($lfile, true);

        echo "Processing {$item}...";
        // keep the bare file name (used in the public URI) apart from the local path
        $oname = $item . '.' . parent::getParameterValue('output_format');
        $ofile = $odir . $oname;
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
        parent::setWriteFile($ofile, $gz);

        // each report is handled by the method that carries its name
        $this->{$item}();

        parent::getWriteFile()->close();
        parent::getReadFile()->close();
        echo "Done" . PHP_EOL;
        parent::clear();

        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("MGI {$item}")
            ->setRetrievedDate(date("Y-m-d\\TH:i:s", filemtime($lfile)))
            ->setFormat("text")
            ->setPublisher("http://www.informatics.jax.org")
            ->setHomepage("http://www.informatics.jax.org")
            ->setRights("use")
            ->setLicense("http://www.informatics.jax.org/mgihome/other/copyright.shtml")
            ->setDataset("http://identifiers.org/mgi/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = date("Y-m-d\\TH:i:s");
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$oname}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$item} in {$prefix}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/mgi/mgi.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
    } // foreach

    // generate the dataset release file
    $this->setWriteFile($odir . parent::getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Process every '*.xml.gz' file of the input directory and write the
 * dataset description to the Bio2RDF release file.
 *
 * When the 'id_list' parameter is not empty it is stored, flipped into a
 * lookup table, in $this->id_list before the files are processed. The graph
 * URI is switched to the dataset URI while processing when 'dataset_graph'
 * is set, and restored before the description is written.
 *
 * Fixes: timestamps are now emitted with gmdate() and a two-digit hour
 * ('H'); the former 'G' format dropped the leading zero of the hour and the
 * literal 'Z' suffix was attached to local time. The output title no longer
 * carries a raw line break inside the literal.
 *
 * NOTE(review): 'dataset_graph' is compared loosely against true, so any
 * non-empty string value enables it -- confirm the parameter's value type.
 */
function process_dir() {
    $this->setCheckPoint('dataset');
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // optional restriction to a comma-separated list of identifiers
    $this->id_list = null;
    if (parent::getParameterValue('id_list') != '') {
        $this->id_list = array_flip(explode(",", trim(parent::getParameterValue("id_list"))));
    }

    $graph_uri = parent::getGraphURI();
    $dataset_description = '';
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;

    // set graph URI to dataset graph
    if (parent::getParameterValue('dataset_graph') == true) {
        parent::setGraphURI(parent::getDatasetURI());
    }

    $files = glob($ldir . "*.xml.gz");
    foreach ($files as $i => $file) {
        echo "Processing {$file} (" . ($i + 1) . "/" . count($files) . ") ...";
        $this->process_file($file);
        parent::clear();
        echo "done!" . PHP_EOL;
    }

    // the literal 'Z' denotes UTC, so the timestamps must be rendered with gmdate()
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $source_file = (new DataResource($this))
        ->setURI("http://www.ncbi.nlm.nih.gov/pubmed")
        ->setTitle("NCBI PubMed")
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($ldir)))
        ->setFormat("text/xml")
        ->setPublisher("http://ncbi.nlm.nih.gov/")
        ->setHomepage("http://www.ncbi.nlm.nih.gov/pubmed/")
        ->setRights("use-share-modify")
        ->setLicense("http://www.nlm.nih.gov/databases/license/license.html")
        ->setDataset("http://identifiers.org/pubmed/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pubmed/pubmed.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description .= $source_file->toRDF() . $output_file->toRDF();

    // set graph URI back to default
    parent::setGraphURI($graph_uri);

    // write the dataset description
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}
/**
 * Parse the directory of ClinicalTrials.gov records ('NCT*' files in the
 * input directory) into a single output file and write the dataset
 * description to the release file.
 *
 * When the 'id_list' parameter is not empty, only trials whose id (file
 * name without '.xml') is listed are processed. Empty files are reported
 * and skipped.
 *
 * Fixes:
 *  - the retrieval date no longer reads an undefined loop variable when no
 *    input file matches; the current time is used in that case;
 *  - a failed glob() is treated as an empty file list;
 *  - timestamps use gmdate() with a two-digit hour ('H') so that the literal
 *    'Z' suffix is correct (the former 'G' format dropped the leading zero);
 *  - the id filter uses a flipped lookup table instead of scanning the id
 *    list for every file.
 *
 * @return bool|null false when the release file cannot be opened
 */
function parse_dir() {
    $this->setCheckPoint('dataset');
    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";
    $date = gmdate($timestamp_format);

    // NOTE(review): this handle is only opened and closed in this method; the
    // description itself is written through writeToReleaseFile().
    $dataset_file = parent::getParameterValue("outdir") . parent::getBio2RDFReleaseFile();
    $fp = fopen($dataset_file, "w");
    if ($fp === FALSE) {
        trigger_error("Unable to open {$dataset_file}", E_USER_ERROR);
        return false;
    }

    // an empty id_list means "process everything"
    $id_list = parent::getParameterValue('id_list');
    $wanted = ($id_list == '') ? null : array_flip(explode(",", $id_list));

    $indir = parent::getParameterValue('indir');
    echo "Processing {$indir}\n";

    $outfile = "clinicaltrials." . parent::getParameterValue('output_format');
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;
    parent::setWriteFile(parent::getParameterValue("outdir") . $outfile, $gz);

    $files = glob($indir . "NCT*");
    if ($files === false) {
        $files = array();
    }
    foreach ($files as $i => $file) {
        if ($i % 10000 == 0) {
            parent::clear();
        }
        $trial_id = basename($file, '.xml');
        if ($wanted !== null && !isset($wanted[$trial_id])) {
            continue;
        }
        if (filesize($file) != 0) {
            echo "Processing {$trial_id}" . PHP_EOL;
            $this->process_file($file);
        } else {
            echo "Processing {$trial_id} -> Empty!" . PHP_EOL;
        }
    }
    echo "Finished." . PHP_EOL;
    parent::getWriteFile()->close();

    // make the dataset description
    parent::setGraphURI(parent::getDatasetURI());
    $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
    $source_version = parent::getDatasetVersion();

    // the retrieval date is taken from the last matched input file, if any
    $retrieved = empty($files) ? time() : filemtime(end($files));

    // dataset description
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("Clinicaltrials")
        ->setRetrievedDate(gmdate($timestamp_format, $retrieved))
        ->setFormat("application/xml")
        ->setPublisher("http://clinicaltrials.gov/")
        ->setHomepage("http://clinicaltrials.gov/")
        ->setRights("use")
        ->setRights("by-attribution")
        ->setLicense("http://clinicaltrials.gov/ct2/about-site/terms-conditions")
        ->setDataset("http://identifiers.org/clinicaltrials/");
    parent::writeToReleaseFile($source_file->toRDF());

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }
    parent::writeToReleaseFile($output_file->toRDF());
    parent::closeReleaseFile();

    // write the dataset description file
    fclose($fp);
}
/**
 * Convert the requested CTD files and write the dataset description.
 *
 * Every file is expected locally as '<indir><file>.gz'. A missing file is
 * downloaded from '<download_url>CTD_<file><suffix>'; sources that are not
 * gzip-compressed remotely are written through the compress.zlib:// wrapper.
 * The file is then converted by the method 'CTD_<file>' and described by a
 * source/output DataResource pair.
 *
 * Fixes:
 *  - the source title had an unbalanced parenthesis and a doubled dot
 *    ("(<file>..gz"); it now reads "(<file>.gz)";
 *  - timestamps use gmdate() with a two-digit hour ('H') so that the literal
 *    'Z' suffix is correct (the former 'G' format dropped the leading zero).
 *
 * NOTE(review): 'dataset_graph' is compared loosely against true, so any
 * non-empty string value enables it -- confirm the parameter's value type.
 */
function process() {
    // get the file list
    if (parent::getParameterValue('files') == 'all') {
        $files = explode("|", parent::getParameterList('files'));
        array_shift($files);
    } else {
        $files = explode(",", parent::getParameterValue('files'));
    }

    $dataset_description = '';

    // set directory values
    $ldir = parent::getParameterValue('indir');
    $rdir = parent::getParameterValue('download_url');
    $odir = parent::getParameterValue('outdir');

    $graph_uri = parent::getGraphURI();
    if (parent::getParameterValue('dataset_graph') == true) {
        parent::setGraphURI(parent::getDatasetURI());
    }

    $gz_suffix = ".gz";
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    // remote suffix of the files that are not published as '.tsv.gz'
    $remote_suffixes = array(
        'chem_gene_ixn_types' => '.tsv',
        'exposure_ontology' => '.obo',
    );
    // identifiers.org dataset of the files that have one
    $datasets = array(
        'chemicals' => "http://identifiers.org/ctd.chemical/",
        'diseases' => "http://identifiers.org/ctd.disease/",
        'genes' => "http://identifiers.org/ctd.gene/",
    );

    foreach ($files as $file) {
        $suffix = isset($remote_suffixes[$file]) ? $remote_suffixes[$file] : ".tsv.gz";

        $lfile = $ldir . $file . $gz_suffix;
        $rfile = $rdir . 'CTD_' . $file . $suffix;
        if (!file_exists($lfile)) {
            trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
            if ($suffix == ".tsv.gz") {
                Utils::DownloadSingle($rfile, $lfile);
            } else {
                // compress uncompressed sources on the fly so the local copy is always gzip
                Utils::DownloadSingle($rfile, "compress.zlib://" . $lfile);
            }
        }

        $out_suffix = parent::getParameterValue('output_format');
        $ofile = "ctd_" . $file . "." . $out_suffix;
        $gz = false;
        if (strstr(parent::getParameterValue('output_format'), "gz")) {
            $gz = true;
        }

        echo "Processing " . $file . " ...";
        parent::setWriteFile($odir . $ofile, $gz);

        // set read file
        parent::setReadFile($lfile, TRUE);

        // each file is handled by the method named 'CTD_<file>'
        $fnx = "CTD_" . $file;
        $this->{$fnx}();

        // close write file
        parent::getWriteFile()->close();
        parent::clear();
        echo "done!" . PHP_EOL;

        // generate the dataset release file
        echo "Generating dataset description... 
";
        $dataset = isset($datasets[$file]) ? $datasets[$file] : null;

        // dataset description
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Comparative Toxicogenomics Database ({$file}{$gz_suffix})")
            ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
            ->setFormat("text/tab-separated-value")
            ->setFormat("application/gzip")
            ->setPublisher("http://ctdbase.org/")
            ->setHomepage("http://ctdbase.org/")
            ->setRights("use")
            ->setRights("by-attribution")
            ->setRights("no-commercial")
            ->setLicense("http://ctdbase.org/about/legal.jsp")
            ->setDataset($dataset);

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = gmdate($timestamp_format);
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/ctd/ctd.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
    }

    parent::setGraphURI($graph_uri);
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Convert the tab-separated iProClass cross-reference table of the current
 * read file into RDF.
 *
 * A new output file 'iproclass.<n>.<output_format>' is started at the first
 * line and then every 1,000,000 lines. For each line, one cross-reference
 * statement is emitted per identifier found in the mapped columns; the RDF
 * buffer is flushed to the write file after every line.
 *
 * Changes compared to the previous implementation:
 *  - short lines are padded with empty fields instead of silencing
 *    undefined-offset errors with '@';
 *  - identifiers of the last column (split on ';') are trimmed and empty
 *    ones skipped, so they no longer carry a leading space;
 *  - the per-column copy-paste blocks are replaced by one table-driven loop
 *    that keeps the original emission order.
 */
private function process() {
    // Column index => array(vocabulary predicate suffix or null,
    //                       identifier prefix or null,
    //                       rdfs:seeAlso base URL or null), in emission order.
    // Column 14 (pir-psd) is skipped because that database is no longer maintained.
    $columns = array(
        1  => array("x-uniprot", "uniprot:", null),
        2  => array("x-ncbigene", "geneid:", null),
        3  => array("x-refseq", "refseq:", null),
        4  => array("x-gi", "gi:", null),
        5  => array("x-pdb", "pdb:", null),
        6  => array("x-pfam", "pfam:", null),
        7  => array("x-go", "go:", null),
        8  => array("x-pirsf", "pirsf:", null),
        9  => array("x-ipi", "ipi:", null),
        10 => array(null, null, "http://uniprot.org/uniref/"),
        11 => array(null, null, "http://uniprot.org/uniref/"),
        12 => array(null, null, "http://uniprot.org/uniref/"),
        13 => array("x-uniparc", "uniparc:", "http://uniprot.org/uniparc/"),
        15 => array("x-taxon", "taxon:", null),
        16 => array("x-omim", "omim:", null),
        17 => array("x-unigene", "unigene:", null),
        18 => array("x-ensembl", "ensembl:", null),
        19 => array("x-pubmed", "pubmed:", null),
        20 => array("x-genbank", "genbank:", null),
        21 => array("x-genbank", "genbank:", null),
    );
    $go_column = 7;
    $last_column = 21;

    $z = 0;
    $y = 1;
    while ($l = $this->getReadFile()->Read(200000)) {
        if ($z++ % 1000000 == 0) {
            echo $z . PHP_EOL;
            $odir = parent::getParameterValue('outdir');
            $ofile = 'iproclass.' . $y++ . "." . parent::getParameterValue('output_format');
            $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
            if (parent::getWriteFile() != null) {
                parent::getWriteFile()->close();
                parent::clear();
            }
            // generate a new file
            parent::setWriteFile($odir . $ofile, $gz);
        }

        // pad short lines so that every mapped column exists
        $fields = array_pad(explode("\t", $l), $last_column + 1, '');
        $fields[$last_column] = trim($fields[$last_column]);

        $uniprot_acc = $fields[0];
        $id_res = $this->getNamespace() . $uniprot_acc;
        parent::addRDF(parent::triplify($id_res, $this->getVoc() . "x-uniprot", "uniprot:" . $uniprot_acc));

        foreach ($columns as $column => $spec) {
            if (empty($fields[$column])) {
                continue;
            }
            list($predicate, $prefix, $see_also) = $spec;

            // the last column is separated by ';' only, all others by '; '
            $delimiter = ($column == $last_column) ? ";" : "; ";
            foreach (explode($delimiter, $fields[$column]) as $xref_id) {
                if ($column == $last_column) {
                    $xref_id = trim($xref_id);
                    if ($xref_id === '') {
                        continue;
                    }
                } elseif ($column == $go_column) {
                    // drop the first three characters so the id follows the "go:" prefix
                    $xref_id = substr($xref_id, 3);
                }

                $rdf = '';
                if ($predicate !== null) {
                    $rdf .= parent::triplify($id_res, $this->getVoc() . $predicate, $prefix . $xref_id);
                }
                if ($see_also !== null) {
                    $rdf .= parent::QQuadO_URL($id_res, "rdfs:seeAlso", $see_also . $xref_id);
                }
                parent::addRDF($rdf);
            }
        }

        // write rdf to file
        $this->WriteRDFBufferToWriteFile();
    } // while
}
/**
 * Download and convert the complete RefSeq protein release files.
 *
 * The list of 'complete.<n>.protein.gpff.gz' files is fetched from the NCBI
 * FTP site (or read from the cached 'ftp_list.txt' in the input directory),
 * missing files are downloaded, every gzip file returned by getFilePaths()
 * is converted with process(), and a description of each source/output pair
 * is written to the release file.
 *
 * Fixes:
 *  - 'download' is compared with 'true' as in the sibling converters; the
 *    former loose comparison with boolean true matched any non-empty value;
 *  - an unusable FTP listing is detected with empty() instead of count() on
 *    a possibly non-array value;
 *  - blank lines of the cached list are ignored instead of being treated as
 *    file names;
 *  - the source URI is built from the file name, not from the local path;
 *  - the source format is declared as gzip (the files are '.gz');
 *  - the Creative Commons license URL was missing '.org';
 *  - timestamps use gmdate() with a two-digit hour ('H') so that the literal
 *    'Z' suffix is correct.
 */
function Run() {
    $dataset_description = '';
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $force_download = ($this->getParameterValue('download') == 'true');

    $list_file = $ldir . "ftp_list.txt";
    if (!file_exists($list_file) || $force_download) {
        echo "Getting FTP file list ...";
        $list = $this->getFtpFileList('ftp.ncbi.nlm.nih.gov', '/refseq/release/complete/', '/(complete\\.[0-9]+\\.protein\\.gpff\\.gz)/');
        if (empty($list)) {
            trigger_error("Unable to get list of files from FTP site. Check internet connection", E_USER_ERROR);
            exit(-1);
        }
        asort($list);
        $buf = implode("\n", $list);
        file_put_contents($list_file, $buf);
        echo "Done." . PHP_EOL;
    } else {
        echo "Using existing ftp list" . PHP_EOL;
        // drop blank lines (e.g. a trailing newline) so they are not taken for files
        $list = array_filter(array_map('trim', explode("\n", file_get_contents($list_file))), 'strlen');
    }

    $counter = 1;
    $total = count($list);
    foreach ($list as $f) {
        $lfile = $ldir . $f;
        echo "Processing " . $counter++ . "/{$total} {$f}. ";
        if (!file_exists($lfile) || $force_download) {
            $rfile = parent::getParameterValue('download_url') . $f;
            echo "Downloading ...";
            utils::DownloadSingle($rfile, $lfile);
            echo "done.";
        } else {
            echo "Using existing file.";
        }
        echo PHP_EOL;
    } // if download

    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    // iterate over the files
    $files = $this->getFilePaths($ldir, 'gz');
    asort($files);
    foreach ($files as $f) {
        $lfile = $ldir . $f;
        $ofile = $odir . basename($f, ".gz") . "." . parent::getParameterValue('output_format');
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;

        parent::setWriteFile($ofile, $gz);
        parent::setReadFile($lfile, true);
        echo "processing {$f} ...";
        $this->process();
        parent::clear();
        echo "done!" . PHP_EOL;
        $this->getReadFile()->close();
        $this->getWriteFile()->close();

        $source_file = (new DataResource($this))
            ->setURI(parent::getParameterValue('download_url') . basename($f))
            ->setTitle("NCBI RefSeq - {$f}")
            ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
            ->setFormat('text/refseq-format')
            ->setFormat('application/gzip')
            ->setPublisher('http://www.ncbi.nlm.nih.gov')
            ->setHomepage('http://www.ncbi.nlm.nih.gov/refseq')
            ->setRights('use')
            ->setRights('attribution')
            ->setLicense('http://www.nlm.nih.gov/copyright.html')
            ->setDataset(parent::getDatasetURI());

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = gmdate($timestamp_format);
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$f}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/refseq/refseq.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        $dataset_description .= $output_file->toRDF() . $source_file->toRDF();
    } // for

    parent::writeToReleaseFile($dataset_description);
    // NOTE(review): the write file was already closed inside the loop above;
    // confirm that closing it once more here is intended.
    parent::getWriteFile()->close();
}
/**
 * Convert the tab-separated gene info table of the current read file into RDF.
 *
 * The first line is read and discarded as the header. Only lines with
 * exactly 15 columns are converted; when $this->taxids is set, lines whose
 * taxonomy id is not a key of it are skipped, and entries with the symbol
 * "NEWENTRY" are ignored. Columns holding "-" are treated as absent. The
 * RDF buffer is flushed to the write file after every line.
 *
 * Fix: the modification date was tested after date_parse(), i.e. an array
 * was compared with "-", which is always unequal; "-" entries therefore
 * produced a malformed date and real dates were emitted without zero
 * padding. The raw column is now tested, and the date is emitted as
 * zero-padded YYYY-MM-DD only when it could be parsed.
 */
private function geneinfo() {
    $i = 1;

    // skip the header line
    $this->GetReadFile()->Read(200000);

    while ($aLine = $this->GetReadFile()->Read(200000)) {
        // release accumulated state every 1000 lines
        if ($i++ % 1000 == 0) {
            parent::clear();
        }

        $splitLine = explode("\t", $aLine);
        if (count($splitLine) == 15) {
            $taxid = "taxon:" . trim($splitLine[0]);
            if (isset($this->taxids) and !isset($this->taxids[trim($splitLine[0])])) {
                continue;
            }

            $geneid = "ncbigene:" . trim($splitLine[1]);
            $symbol = addslashes(stripslashes(trim($splitLine[2])));
            $symbolid = "symbol:{$symbol}";
            $locusTag = trim($splitLine[3]);
            $symbols_arr = explode("|", $splitLine[4]);
            $dbxrefs_arr = explode("|", $splitLine[5]);
            $chromosome = trim($splitLine[6]);
            $map_location = trim($splitLine[7]);
            $description = addslashes(stripslashes(trim($splitLine[8])));
            $type_of_gene = trim($splitLine[9]);
            $symbol_authority = addslashes(stripslashes(trim($splitLine[10])));
            $symbol_auth_full_name = addslashes(stripslashes(trim($splitLine[11])));
            $nomenclature_status = addslashes(stripslashes(trim($splitLine[12])));
            $other_designations = addslashes(stripslashes(trim($splitLine[13])));
            $raw_mod_date = trim($splitLine[14]);

            // check for a valid symbol
            if ($symbol != "NEWENTRY") {
                $this->AddRDF(
                    parent::describeIndividual($geneid, "{$description} ({$symbolid}, {$taxid})", $this->getVoc() . "Gene") .
                    parent::triplify($geneid, $this->getVoc() . "x-taxonomy", $taxid) .
                    parent::triplifyString($geneid, $this->getVoc() . "symbol", $symbol) .
                    parent::triplifyString($geneid, $this->getVoc() . "locus", addslashes(stripslashes($locusTag))) .
                    parent::describeClass($this->getVoc() . "Gene", "NCBI Gene gene")
                );

                if ($type_of_gene != '-') {
                    $this->AddRDF(
                        parent::triplify($geneid, "rdf:type", $this->getVoc() . ucfirst($type_of_gene) . "-Gene") .
                        parent::describeClass($this->getVoc() . ucfirst($type_of_gene) . "-Gene", ucfirst($type_of_gene) . " Gene")
                    );
                }

                // symbol synonyms
                foreach ($symbols_arr as $s) {
                    if ($s != "-") {
                        $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "symbol-synonym", addslashes(stripslashes($s))));
                    }
                }

                // dbxrefs
                foreach ($dbxrefs_arr as $dbx) {
                    if ($dbx != "-") {
                        $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "dbxref", $dbx));
                    }
                }

                // chromosome
                if ($chromosome != "-") {
                    $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "chromosome", $chromosome));
                }

                // map location
                if ($map_location != "-") {
                    $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "map-location", $map_location));
                }

                // description
                if ($description != "-") {
                    $this->AddRDF(parent::triplifyString($geneid, "dc:description", $description));
                }

                // nomenclature authority
                if ($symbol_authority != "-") {
                    $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-authority", $symbol_authority));
                    if ($symbol_auth_full_name != "-") {
                        $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-authority-fullname", $symbol_auth_full_name));
                    }
                }

                // nomenclature status
                if ($nomenclature_status != "-") {
                    $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "nomenclature-status", $nomenclature_status));
                }

                // other designations
                if ($other_designations != "-") {
                    foreach (explode("|", $other_designations) as $d) {
                        $this->AddRDF(parent::triplifyString($geneid, $this->getVoc() . "other-designation", $d));
                    }
                }

                // modification date: test the raw column, then emit only a parsable date
                if ($raw_mod_date != "-") {
                    $mod_date = date_parse($raw_mod_date);
                    if ($mod_date["year"] !== false && $mod_date["month"] !== false && $mod_date["day"] !== false) {
                        $this->AddRDF(parent::triplifyString(
                            $geneid,
                            $this->getVoc() . "modification-date",
                            sprintf("%04d-%02d-%02d", $mod_date["year"], $mod_date["month"], $mod_date["day"])
                        ));
                    }
                }
            }
        }
        parent::writeRDFBufferToWriteFile();
    } // while
}
/**
 * Download (where possible) and convert the requested PharmGKB archives,
 * then write the dataset description to the release file.
 *
 * Every file is expected as '<indir><file>.zip'. The entries to read from
 * the archive depend on the file; each entry is converted by the method
 * named after the file (or after the entry for 'annotations') and described
 * by a source/output DataResource pair.
 *
 * Fixes:
 *  - ZipArchive::open() returns true on success and an error code otherwise,
 *    so the former '=== FALSE' check could never detect a failure;
 *  - 'download' is compared with 'true'; the former loose comparison with
 *    boolean true matched any non-empty value;
 *  - the offsides download announced "twosides";
 *  - the missing-entry error names the entry that was actually requested;
 *  - timestamps use gmdate() with a two-digit hour ('H') so that the literal
 *    'Z' suffix is correct.
 *
 * NOTE(review): the offsides/twosides download URLs embed FTP credentials --
 * confirm these are meant to be public.
 *
 * @return bool|null FALSE when an expected archive entry cannot be read
 */
function run() {
    // get the file list
    if ($this->GetParameterValue('files') == 'all') {
        $files = explode("|", $this->GetParameterList('files'));
        array_shift($files);
    } else {
        $files = explode(",", $this->GetParameterValue('files'));
    }
    if ($this->getParameterValue('additional') != 'none') {
        $f = explode(",", $this->getParameterValue('additional'));
        $files = array_merge($files, $f);
    }

    $ldir = $this->GetParameterValue('indir');
    $odir = $this->GetParameterValue('outdir');
    $rdir = $this->GetParameterValue('download_url');
    $dataset_description = '';
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    foreach ($files as $file) {
        $suffix = ".zip";
        $lfile = $ldir . $file . $suffix;
        $rfile = $rdir . $file . $suffix;

        if ($file == "offsides" and !file_exists($lfile)) {
            echo "downloading {$file} ...";
            $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip";
            utils::DownloadSingle($rfile, $lfile);
            echo "done" . PHP_EOL;
        } elseif ($file == "twosides" and !file_exists($lfile)) {
            echo "downloading {$file} ...";
            $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip";
            utils::DownloadSingle($rfile, $lfile);
            echo "done" . PHP_EOL;
        } elseif ($file == 'annotations' or $file == 'relationships') {
            // these archives cannot be downloaded automatically
            if (!file_exists($lfile)) {
                echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip" . PHP_EOL;
                continue;
            }
        } else {
            if (!file_exists($lfile) or parent::getParameterValue('download') == 'true') {
                echo "Downloading {$lfile} ... ";
                Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId=' . $file . '.zip&dlCls=common', $lfile);
                echo "done" . PHP_EOL;
            }
        }

        // get a pointer to the file in the zip archive
        if (!file_exists($lfile)) {
            echo "no local copy of {$lfile} . skipping" . PHP_EOL;
            continue;
        }
        $zin = new ZipArchive();
        if ($zin->open($lfile) !== true) {
            trigger_error("Unable to open {$lfile}");
            exit;
        }

        $zipentries = array();
        if ($file == "annotations") {
            // exclude: 'clinical_ann.tsv','study_parameters.tsv'
            $zipentries = array('clinical_ann_metadata.tsv', 'var_drug_ann.tsv', 'var_pheno_ann.tsv', 'var_fa_ann.tsv');
        } elseif ($file == "pathways") {
            // every entry of the archive except the '.txt' files
            for ($i = 0; $i < $zin->numFiles; $i++) {
                $stat = $zin->statIndex($i);
                $entry = $stat['name'];
                $ext = pathinfo($entry, PATHINFO_EXTENSION);
                if ($ext != "txt") {
                    $zipentries[] = $entry;
                }
            }
        } elseif ($file == "relationships") {
            $zipentries = array("relationships.tsv");
        } elseif ($file == 'offsides') {
            $zipentries = array('3003377s-offsides.tsv');
        } elseif ($file == 'twosides') {
            $zipentries = array('3003377s-twosides.tsv');
        } else {
            $zipentries = array($file . ".tsv");
        }

        // set the write file, parse, write and close
        $suffix = parent::getParameterValue('output_format');
        $outfile = $file . '.' . $suffix;
        $gz = false;
        if (strstr(parent::getParameterValue('output_format'), "gz")) {
            $gz = true;
        }
        $this->SetWriteFile($odir . $outfile, $gz);

        foreach ($zipentries as $zipentry) {
            if (($fp = $zin->getStream($zipentry)) === FALSE) {
                trigger_error("Unable to get {$zipentry} in ziparchive {$lfile}");
                return FALSE;
            }
            $this->SetReadFile($lfile);
            $this->GetReadFile()->SetFilePointer($fp);

            // pick the method that converts this entry
            if ($file == "annotations") {
                $fnx = substr($zipentry, 0, strpos($zipentry, ".tsv"));
                echo "processing {$zipentry}..";
            } elseif ($file == 'pathways') {
                $fnx = 'pathways';
                echo "processing {$fnx} ({$zipentry})... ";
            } else {
                $fnx = $file;
                echo "processing {$fnx} ... ";
            }
            $this->{$fnx}();
            parent::writeRDFBufferToWriteFile();
            parent::clear();
            echo "done!" . PHP_EOL;

            // generate the dataset release file
            $source_file = (new DataResource($this))
                ->setURI($rfile)
                ->setTitle("Pharmacogenomics Knowledge Base ({$zipentry})")
                ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
                ->setFormat("text/tab-separated-value")
                ->setFormat("application/zip")
                ->setPublisher("http://www.pharmgkb.org/")
                ->setHomepage("http://www.pharmgkb.org/")
                ->setRights("use")
                ->setRights("no-commercial")
                ->setLicense("http://www.pharmgkb.org/page/policies")
                ->setDataset("http://identifiers.org/pharmgkb/");

            $prefix = parent::getPrefix();
            $bVersion = parent::getParameterValue('bio2rdf_release');
            $date = gmdate($timestamp_format);
            $output_file = (new DataResource($this))
                ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
                ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} {$file} (generated at {$date})")
                ->setSource($source_file->getURI())
                ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pharmgkb/pharmgkb.php")
                ->setCreateDate($date)
                ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
                ->setPublisher("http://bio2rdf.org")
                ->setRights("use-share-modify")
                ->setRights("by-attribution")
                ->setRights("restricted-by-source-license")
                ->setLicense("http://creativecommons.org/licenses/by/3.0/")
                ->setDataset(parent::getDatasetURI());

            if ($gz) {
                $output_file->setFormat("application/gzip");
            }
            if (strstr(parent::getParameterValue('output_format'), "nt")) {
                $output_file->setFormat("application/n-triples");
            } else {
                $output_file->setFormat("application/n-quads");
            }

            $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
        }
        $this->GetWriteFile()->Close();
    } // foreach

    echo "Generating dataset description... ";
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Convert the Affymetrix probeset annotation archives into one RDF file and
 * write the accompanying dataset description.
 *
 * The listing page is downloaded (or reused from the input directory) and
 * scanned for quoted "*.csv.zip" references. Every selected archive that is
 * present in the input directory is parsed; a source description is recorded
 * per archive and one output description is recorded for the generated file.
 *
 * @return bool|null true when the run completes; false when the CSV entry
 *                   cannot be read from an archive. The script exits when the
 *                   listing contains no archives, nothing was selected, or an
 *                   archive cannot be opened.
 */
function Run() {
    // directory shortcuts
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // get the listings page
    $url = trim(parent::getParameterValue('download_url'));
    $listing_file = $ldir . "probeset_list.html";
    if (!file_exists($listing_file) || parent::getParameterValue("download") == "true") {
        echo "Downloading {$listing_file}" . PHP_EOL;
        Utils::DownloadSingle($url, $listing_file);
    }
    $listings = file_get_contents($listing_file);

    // make a list of the csv.zip files
    preg_match_all("/\"([^\"]+)\\.csv\\.zip\"/", $listings, $m);
    if (count($m[1]) == 0) {
        trigger_error("could not find any .csv.zip files in {$url}");
        exit;
    }

    // select either every listed archive or the ones matching the requested names
    $myfiles = array();
    if (parent::getParameterValue("files") == 'all') {
        $myfiles = $m[1];
    } else {
        foreach (explode(",", parent::getParameterValue("files")) as $f) {
            $found = false;
            foreach ($m[1] as $n) {
                if (strstr($n, $f)) {
                    $found = true;
                    $myfiles[] = $n;
                    break;
                }
            }
            if ($found === false) {
                echo "cannot find {$f} in list" . PHP_EOL;
            }
        }
    }
    if (count($myfiles) == 0) {
        exit; // nothing to do
    }

    $dataset_description = '';

    // set the write file
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;
    $outfile = 'affymetrix.' . parent::getParameterValue('output_format');
    $this->setWriteFile($odir . $outfile, $gz);

    // stays null when every archive is skipped, so the output description
    // below does not dereference an undefined variable
    $source_file = null;

    // iterate over the files
    foreach ($myfiles as $rfile) {
        $base_file = substr($rfile, strrpos($rfile, "/") + 1);

        // get and set the dataset version
        if (parent::getDatasetVersion() == null) {
            preg_match("/\\.na([0-9]{2})\\.annot/", $base_file, $vm);
            if (isset($vm[1])) {
                $this->setDatasetVersion($vm[1]);
            }
        }
        if (parent::getDatasetVersion() != parent::getParameterValue('version')) {
            $base_file = str_replace("na" . parent::getDatasetVersion(), "na" . parent::getParameterValue('version'), $base_file);
        }

        $csv_file = $base_file . ".csv";
        $zip_file = $csv_file . ".zip";
        $lfile = $ldir . $zip_file;
        if (!file_exists($lfile)) {
            echo "skipping: {$lfile} does not exist" . PHP_EOL;
            continue;
        }
        echo "processing {$lfile}" . PHP_EOL;

        // open the zip file; ZipArchive::open() returns true on success and an
        // integer error code on failure, so it must be compared against true
        $zin = new ZipArchive();
        if ($zin->open($lfile) !== true) {
            trigger_error("Unable to open {$lfile}");
            exit;
        }
        if (($fp = $zin->getStream($csv_file)) === FALSE) {
            trigger_error("Unable to get {$csv_file} in ziparchive {$lfile}");
            return FALSE;
        }
        parent::setReadFile($lfile);
        parent::getReadFile()->setFilePointer($fp);
        $this->parse($base_file);
        parent::getReadFile()->close();
        parent::clear();

        // dataset description
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Affymetrix Probeset: {$base_file}")
            ->setRetrievedDate(parent::getDate(filemtime($lfile)))
            ->setFormat("text/tab-separated-value")
            ->setFormat("application/zip")
            ->setPublisher("http://affymetrix.com")
            ->setHomepage("http://www.affymetrix.com/support/technical/annotationfilesmain.affx")
            ->setRights("use")
            ->setRights("no-commercial")
            ->setRights("registration-required")
            ->setLicense("http://www.affymetrix.com/about_affymetrix/legal/index.affx")
            ->setDataset("http://identifiers.org/affy.probeset/");
        $dataset_description .= $source_file->toRDF();
    }
    $this->getWriteFile()->close();

    // describe the generated output file
    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = parent::getDate(filemtime($odir . $outfile));
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}");
    if ($source_file !== null) {
        // as before, the output is linked to the last archive that was processed
        $output_file = $output_file->setSource($source_file->getURI());
    }
    $output_file = $output_file
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/affymetrix/affymetrix.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());
    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }
    $dataset_description .= $output_file->toRDF();

    // write the dataset description
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
    return true;
}
/**
 * Read the current read file line by line and emit RDF for each annotation row.
 *
 * Lines beginning with "!" are skipped. Every other line must split into
 * exactly 17 tab-separated columns; otherwise an E_USER_ERROR is raised and
 * false is returned. For each row the annotated entity and a numbered
 * annotation resource (named from $file plus a running counter) are described
 * and the RDF buffer is flushed to the write file.
 *
 * @param string $file label used as the stem of the generated annotation ids
 * @return false|null false on a malformed row, otherwise nothing
 */
function process($file) {
    $voc = parent::getVoc();
    $res = parent::getRes();
    $counter = 1;

    while ($line = parent::getReadFile()->read(100000)) {
        if ($counter % 100000 == 0) {
            parent::clear();
        }
        if ($line[0] == "!") {
            continue;
        }

        $cols = explode("\t", $line);
        $ncols = count($cols);
        if ($ncols != 17) {
            trigger_error("Expected 17 columns, but found " . $ncols, E_USER_ERROR);
            return false;
        }

        // pull the columns that are used below
        $db = $cols[0];
        $id = $cols[1];
        $symbol = $cols[2];
        $qualifier = $cols[3];
        $goid = substr($cols[4], 3); // drops the first three characters of the GO id column
        $refs = $this->getDbReferences($cols[5]);
        $eco = $this->getEvidenceCodeLabelArr($cols[6]);
        $aspect = $this->getAspect($cols[8]);
        $label = $cols[9];
        $synonyms = explode("|", $cols[10]);
        $taxid = $cols[12];
        $date = $this->parseDate($cols[13]);
        $assignedBy = $cols[14];

        // entity id; rows whose database cannot be mapped are dumped and skipped
        $eid = $this->getdbURI($db, $id);
        if (!$eid) {
            print_r($cols);
            continue;
        }

        // the annotated entity
        $entity = parent::describeIndividual($eid, $label, $voc . "GO-Annotation");
        $entity .= parent::describeClass($voc . "GO-Annotation", "GO Annotation");
        $entity .= parent::triplifyString($eid, $voc . "symbol", $symbol);
        parent::addRDF($entity);
        parent::addRDF(parent::triplify($eid, $voc . "x-taxonomy", $taxid));

        // array_filter without a callback drops exactly the values empty() rejects
        foreach (array_filter($synonyms) as $synonym) {
            parent::addRDF(parent::triplifyString($eid, $voc . "synonym", $synonym));
        }

        // the relation is the aspect itself, or its negated form for NOT qualifiers
        $rel = $aspect;
        if ($qualifier == 'NOT') {
            if ($aspect == 'process') {
                $rel = 'not-in-process';
            }
            if ($aspect == 'function') {
                $rel = 'not-has-function';
            }
            if ($aspect == 'component') {
                $rel = 'not-in-component';
            }
        }
        $direct = parent::describeObjectProperty($voc . $rel, str_replace("-", " ", $rel));
        $direct .= parent::triplify($eid, $voc . $rel, "go:" . $goid);
        parent::addRDF($direct);

        // the annotation resource linking entity, term, evidence and category
        $type = key($eco);
        $aid = $res . $file . "_" . $counter;
        $counter++;

        $link = parent::describeObjectProperty($voc . "go-annotation", "GO annotation");
        $link .= parent::triplify($eid, $voc . "go-annotation", $aid);
        parent::addRDF($link);

        $cat = $res . md5($aspect);
        $annotation = parent::describeIndividual($aid, "{$id}-go:{$goid} association", $voc . "GO-Annotation");
        $annotation .= parent::triplify($aid, $voc . "target", $eid);
        $annotation .= parent::triplify($aid, $voc . "go-term", "go:" . $goid);
        $annotation .= parent::triplify($aid, $voc . "evidence", "eco:" . $eco[$type][1]);
        $annotation .= parent::triplify($aid, $voc . "go-category", $cat);
        $annotation .= parent::describeClass($cat, $aspect);
        $annotation .= parent::triplifyString($aid, $voc . "assigned-by", $assignedBy);
        parent::addRDF($annotation);

        if ($date != '') {
            parent::addRDF(parent::triplifyString($aid, $voc . "entry-date", $date . "T00:00:00Z", "xsd:dateTime"));
        }

        // only PMID references are turned into article links
        foreach ($refs as $ref) {
            $parts = explode(":", $ref);
            if ($parts[0] == 'PMID') {
                parent::addRDF(parent::triplify($aid, $voc . "article", "pubmed:" . $parts[1]));
            }
        }

        //write RDF to file
        parent::writeRDFBufferToWriteFile();
    }
}
/**
 * Convert the selected KEGG lists to RDF and write the dataset description.
 *
 * "genes" is handled separately: the genome list is read and, for each
 * organism in the built-in organism list, that organism's gene list is
 * processed into a single kegg-genes output file. Every other selected
 * database is downloaded (when missing or when download is 'true'), processed
 * into its own kegg-<db> output file, and described.
 */
function run() {
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $dataset_description = '';

    $files = parent::getParameterValue('files');
    if ($files == 'all') {
        $files = explode('|', parent::getParameterList('files'));
        array_shift($files); // drop the first entry of the parameter list
    } else {
        $files = explode(',', parent::getParameterValue('files'));
    }
    if (parent::getParameterValue('id_list') != '') {
        $this->idlist = explode(",", parent::getParameterValue("id_list"));
    }

    // handle genes separately
    if (in_array("genes", $files, true)) {
        $orgs = array("hsa"); //,"mmu","eco","dre","dme","ath","sce","ddi");
        echo "processing genes" . PHP_EOL;
        $ofile = "kegg-genes." . parent::getParameterValue('output_format');
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
        parent::setWriteFile($odir . $ofile, $gz);

        // get the list of genomes
        $lfile = $ldir . "genome.txt";
        $rfile = parent::getParameterValue("download_url") . "list/genome";
        if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
            utils::downloadSingle($rfile, $lfile);
        }

        // stays null when no organism is processed, so the output description
        // below does not dereference an undefined variable
        $source_file = null;

        $fp = fopen($lfile, "r");
        if ($fp === false) {
            trigger_error("Unable to open {$lfile}", E_USER_WARNING);
        } else {
            while (($l = fgets($fp)) !== false) {
                $a = explode("\t", $l);
                if (!isset($a[1])) {
                    continue; // line without a tab: no organism column to read
                }
                $b = explode(", ", $a[1]);
                $org = $b[0];
                if (!in_array($org, $orgs, true)) {
                    continue;
                }

                // get the list of genes for this organism
                echo "processing {$org}" . PHP_EOL;
                $this->org = $org; // read while processing this organism's genes
                $lfile = $ldir . $org . ".txt";
                $rfile = parent::getParameterValue("download_url") . "list/{$org}";
                if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
                    utils::downloadSingle($rfile, $lfile);
                }
                parent::setReadFile($lfile, false);
                $this->process("gene");
                parent::getReadFile()->close();
                parent::clear();
                $this->org = null;

                // add dataset description
                $source_file = (new DataResource($this))
                    ->setURI($rfile)
                    ->setTitle("KEGG: Gene")
                    ->setRetrievedDate(parent::getDate(filemtime($lfile)))
                    ->setFormat("text/plain")
                    ->setPublisher("http://www.kegg.jp/")
                    ->setHomepage("http://www.kegg.jp/")
                    ->setRights("use")
                    ->setRights("no-commercial")
                    ->setLicense("http://www.kegg.jp/kegg/legal.html")
                    ->setDataset("http://identifiers.org/kegg/");
                $dataset_description .= $source_file->toRDF();
            }
            fclose($fp);
        }
        parent::getWriteFile()->close();
        echo "done" . PHP_EOL;

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = parent::getDate(filemtime($odir . $ofile));
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - Gene ");
        if ($source_file !== null) {
            // as before, the output is linked to the last organism list processed
            $output_file = $output_file->setSource($source_file->getURI());
        }
        $output_file = $output_file
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/kegg/kegg.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());
        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }
        $dataset_description .= $output_file->toRDF();
    }

    // all other files
    foreach ($files as $db) {
        if ($db == "genes") {
            continue;
        }
        echo "processing {$db}" . PHP_EOL;
        $lfile = $ldir . $db . ".txt";
        $rfile = parent::getParameterValue("download_url") . "list/{$db}";
        if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
            echo "Downloading {$rfile} ";
            $ret = utils::downloadSingle($rfile, $lfile);
            if ($ret === false) {
                // report the database being processed ($file is not defined here)
                echo "unable to download {$db} ... skipping" . PHP_EOL;
                continue;
            }
            echo "done." . PHP_EOL;
        }

        // now for each list, get the individual entries
        $ofile = "kegg-{$db}." . parent::getParameterValue('output_format');
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
        parent::setReadFile($lfile, false);
        parent::setWriteFile($odir . $ofile, $gz);
        $this->process($db);
        parent::getWriteFile()->close();
        parent::getReadFile()->close();
        parent::clear();
        echo "done!" . PHP_EOL;

        // add dataset description
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("KEGG: {$db}")
            ->setRetrievedDate(parent::getDate(filemtime($lfile)))
            ->setFormat("text/plain")
            ->setPublisher("http://www.kegg.jp/")
            ->setHomepage("http://www.kegg.jp/")
            ->setRights("use")
            ->setRights("no-commercial")
            ->setLicense("http://www.kegg.jp/kegg/legal.html")
            ->setDataset("http://identifiers.org/kegg/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = parent::getDate(filemtime($odir . $ofile));
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$db} ")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/kegg/kegg.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());
        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }
        $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
    }

    // write the dataset description
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}
/**
 * Convert the selected NCBI Taxonomy packages to RDF and write the dataset
 * description.
 *
 * Each selected package is downloaded when requested or when its local file
 * is missing. The zip-based packages (taxdmp, gi2taxid_protein,
 * gi2taxid_nucleotide) are then opened and each supported entry is handed to
 * the method named after its content key, writing one taxonomy-<key> output
 * file per entry. The release file is written once after all packages.
 */
public function Run() {
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // which files are to be converted?
    $selectedPackage = trim(parent::getParameterValue('files'));
    if ($selectedPackage == 'all') {
        $files = $this->getPackageMap();
    } else {
        $pm = $this->getPackageMap();
        $files = array();
        foreach (explode(",", $selectedPackage) as $a) {
            if (array_key_exists($a, $pm)) {
                $files[$a] = $pm[$a];
            }
        }
    }

    // Other converters in this file compare the download parameter with the
    // string 'true'; a loose comparison against a boolean treats the string
    // "false" as true. Accept the boolean and string spellings explicitly.
    $download_requested = in_array(parent::getParameterValue('download'), array(true, 'true', 1, '1'), true);

    $zip_packages = array("taxdmp", "gi2taxid_protein", "gi2taxid_nucleotide");
    $supported_contents = array("names", "nodes", "citations", "gencode", "division", "gi_taxid_prot", "gi_taxid_nucl");

    $dataset_description = '';
    foreach ($files as $key => $value) {
        $lfile = $ldir . $value['filename'];

        // a missing file forces a download of this package only
        $download = $download_requested;
        if (!$download && !file_exists($lfile)) {
            trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
            $download = true;
        }

        //download all files [except mapping file]
        if ($download) {
            $rfile = $value["file_url"];
            // var_dump() prints its argument and returns nothing, so the URL
            // is interpolated directly
            echo "downloading " . $rfile . " ... ";
            utils::downloadSingle($rfile, $lfile);
        }

        if (!in_array($key, $zip_packages, true)) {
            continue;
        }

        //get the name of the zip archive
        $lfile = $value["filename"];
        $zinfile = $ldir . $lfile;

        // make sure we have the zip archive; ZipArchive::open() returns true on
        // success and an integer error code on failure
        $zin = new ZipArchive();
        if ($zin->open($zinfile) !== true) {
            trigger_error("Unable to open {$zinfile}");
            exit;
        }

        $source_file = (new DataResource($this))
            ->setURI($value['file_url'])
            ->setTitle('NCBI Taxonomy - ' . $key)
            ->setRetrievedDate(date("Y-m-d\\TH:i:sP", filemtime($ldir . $lfile)))
            ->setFormat('text/tab-separated-value')
            ->setFormat('application/zip')
            ->setPublisher('http://www.ncbi.nlm.nih.gov')
            ->setHomepage('http://www.ncbi.nlm.nih.gov/taxonomy')
            ->setRights('use')
            ->setRights('attribution')
            ->setLicense('https://www.nlm.nih.gov/copyright.html')
            ->setDataset(parent::getDatasetURI());

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = date("Y-m-d\\TH:i:sP");
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$key}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/taxonomy/taxonomy.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("restricted-by-source-license")
            // the host was missing ".org"; now matches the licence URL used by the other converters
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());
        $dataset_description .= $output_file->toRDF() . $source_file->toRDF();

        //now iterate over the files in the ziparchive
        foreach ($value["contents"] as $k => $fn) {
            if (!in_array($k, $supported_contents, true)) {
                continue;
            }
            $fpin = $zin->getStream($fn);
            if (!$fpin) {
                trigger_error("Unable to get pointer to {$fn} in {$zinfile}");
                exit("failed\n");
            }
            $gzoutfile = $odir . "taxonomy-{$k}" . "." . parent::getParameterValue('output_format');

            //set the write file
            $gz = strstr(parent::getParameterValue('output_format'), 'gz') ? true : false;
            parent::setReadFile($ldir . $lfile);
            parent::getReadFile()->SetFilePointer($fpin);
            parent::setWriteFile($gzoutfile, $gz);

            echo "processing {$fn}...\n";
            $this->{$k}(); // dispatch to the parser method named after the content key
            $this->GetWriteFile()->Close();
            echo "done!" . PHP_EOL;
            parent::clear();
        }
    }

    // write the accumulated dataset description once, after every package
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}
/**
 * Convert the selected DrugBank files to RDF and append their descriptions to
 * the release file.
 *
 * For every selected entry the archive is downloaded when missing or when a
 * download is requested, handed to the parse_<entry> method, and described
 * (source file and generated output file) under the dataset graph URI.
 * Entries without a known archive mapping are reported and skipped.
 */
function Run() {
    $indir = parent::getParameterValue('indir');
    $outdir = parent::getParameterValue('outdir');

    if (parent::getParameterValue('files') == 'all') {
        $files = explode("|", parent::getParameterList('files'));
        array_shift($files); // drop the first entry of the parameter list
    } else {
        $files = explode("|", parent::getParameterValue('files'));
    }
    if (parent::getParameterValue("id_list")) {
        $this->id_list = array_flip(explode(",", parent::getParameterValue('id_list')));
    }

    // entry name => archive file name and local output stem
    $known = array(
        'drugbank' => array('file' => 'drugbank.xml.zip', 'lname' => 'drugbank'),
    );

    // Other converters in this file compare the download parameter with the
    // string 'true'; a loose comparison against a boolean treats the string
    // "false" as true. Accept the boolean and string spellings explicitly.
    $download_requested = in_array(parent::getParameterValue('download'), array(true, 'true', 1, '1'), true);

    // the output format does not change between entries
    $format = parent::getParameterValue('output_format');
    $gz = strstr($format, ".gz") === FALSE ? false : true;

    foreach ($files as $f) {
        if (!isset($known[$f])) {
            // without a mapping there is no archive name to work with; the
            // previous code fell through with undefined (or stale) values
            echo "skipping unknown file '{$f}'" . PHP_EOL;
            continue;
        }
        $file = $known[$f]['file'];
        $lname = $known[$f]['lname'];

        $fnx = 'parse_' . $f;
        $rfile = parent::getParameterValue('download_url') . $file;
        $lfile = $indir . $file;
        $cfile = $lname . "." . $format;

        // download
        if (!file_exists($lfile) || $download_requested) {
            utils::downloadSingle($rfile, $lfile);
        }

        // setup the write
        parent::setWriteFile($outdir . $cfile, $gz);
        echo $outdir . $cfile;

        if (file_exists($lfile)) {
            // call the parser
            echo "processing {$file} ..." . PHP_EOL;
            $this->{$fnx}($indir, $file);
            echo "done" . PHP_EOL;
            parent::clear();
        }
        parent::getWriteFile()->close();

        // dataset description: switch to the dataset graph while describing,
        // then restore the previous graph URI
        $ouri = parent::getGraphURI();
        parent::setGraphURI(parent::getDatasetURI());

        $source_version = parent::getDatasetVersion();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $prefix = parent::getPrefix();
        $date = date("Y-m-d\\TH:i:sP");

        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("DrugBank ({$file})")
            ->setRetrievedDate(date("Y-m-d\\TH:i:sP", filemtime($lfile)))
            ->setFormat("application/xml")
            ->setFormat("application/zip")
            ->setPublisher("http://drugbank.ca")
            ->setHomepage("http://drugbank.ca")
            ->setRights("use")
            ->setRights("by-attribution")
            ->setRights("no-commercial")
            ->setLicense("http://www.drugbank.ca/about")
            ->setDataset("http://identifiers.org/drugbank/");

        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$cfile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/drugbank/drugbank.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());
        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr($format, "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        parent::writeToReleaseFile($source_file->toRDF() . $output_file->toRDF());
        parent::setGraphURI($ouri);
    }
    parent::closeReleaseFile();
}
/**
 * Convert the selected dbSNP records to a single RDF file and write the
 * dataset description.
 *
 * The 'files' parameter selects the SNP list: 'all', 'clinical', 'omim' and
 * 'pharmgkb' obtain the list from the corresponding helper; any other value is
 * used as a comma separated list of identifiers. Each record is downloaded
 * when its local file is missing or when a download is requested, parsed, and
 * flushed to the write file.
 */
function run() {
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // Other converters in this file compare the download parameter with the
    // string 'true'; a loose comparison against a boolean treats the string
    // "false" as true. Accept the boolean and string spellings explicitly.
    $download_requested = in_array(parent::getParameterValue('download'), array(true, 'true', 1, '1'), true);

    // work out which snps to process
    $snps = explode(",", parent::getParameterValue('files'));
    $selection = $snps[0];
    if ($selection == 'all') {
        $snps = $this->getSNPs();
    } elseif ($selection == 'clinical') {
        $snps = $this->getSNPs(true);
    } elseif ($selection == 'omim') {
        $lfile = $ldir . 'snp_omimvar.txt';
        if (!file_exists($lfile) || $download_requested) {
            utils::DownloadSingle('ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/snp_omimvar.txt', $lfile);
        }
        $snps = $this->processOMIMVar($lfile);
    } elseif ($selection == 'pharmgkb') {
        $lfile = $ldir . 'pharmgkb.snp.zip';
        if (!file_exists($lfile) || $download_requested) {
            utils::DownloadSingle('http://www.pharmgkb.org/download.do?objId=rsid.zip&dlCls=common', $lfile);
        }
        $snps = $this->processPharmGKBSnps($lfile);
    }

    // the bare file name is kept apart from the local path so that the release
    // URI below does not embed the output directory
    $outname = "dbsnp." . parent::getParameterValue('output_format');
    $outfile = $odir . $outname;
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true;
    parent::setWriteFile($outfile, $gz);

    // fallback source URI for an empty snp list; otherwise the URI of the last
    // snp handled is used, as before
    $rfile = parent::getParameterValue('download_url');

    $n = count($snps);
    $z = 0;
    foreach ($snps as $i => $snp) {
        $file = $snp . '.xml.gz';
        $infile = $ldir . $file;
        $rfile = parent::getParameterValue('download_url') . $snp;

        // Decide per record. Setting the global download parameter here made
        // one missing file force a re-download of every following record.
        $download = $download_requested || !file_exists($infile);
        if ($download) {
            trigger_error("Downloading {$file}", E_USER_NOTICE);
            $ret = utils::downloadSingle($rfile, "compress.zlib://" . $infile, true);
            if ($ret === false) {
                continue;
            }
        }

        // process
        echo "Processing {$snp} (" . ($i + 1) . "/{$n})" . PHP_EOL;
        $this->parse($infile);
        parent::writeRDFBufferToWriteFile();
        if ($z++ % 10000 == 0) {
            parent::clear();
        }
    }
    parent::getWriteFile()->close();

    // generate the dataset description file
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("dbSNP " . parent::getDatasetVersion())
        ->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z"))
        ->setFormat("application/xml")
        ->setPublisher("http://www.ncbi.nlm.nih.gov")
        ->setHomepage("http://www.ncbi.nlm.nih.gov/SNP/")
        ->setRights("use-share-modify")
        ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
        ->setDataset("http://identifiers.org/dbsnp/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = date("Y-m-d\\TG:i:s\\Z");
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outname}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dbsnp/dbsnp.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());
    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
}
/**
 * Parse the current read file as RDF/XML and emit each resulting triple.
 *
 * The file is read in full through the compress.zlib:// wrapper, parsed with
 * the ARC2 RDF/XML parser against the bioportal base URI, and every triple is
 * passed to TriplifyMap() together with the lower-cased abbreviation. The RDF
 * buffer is flushed to the write file after each triple.
 *
 * @param string $abbv abbreviation handed (lower-cased) to TriplifyMap()
 */
private function OWL2RDF($abbv) {
    $path = parent::getReadFile()->getFilename();
    $content = file_get_contents("compress.zlib://" . $path);

    $rdfxml = ARC2::getRDFXMLParser('file://' . $path);
    $rdfxml->parse("http://bio2rdf.org/bioportal#", $content);

    foreach ($rdfxml->getTriples() as $triple) {
        $this->TriplifyMap($triple, strtolower($abbv));
        parent::writeRDFBufferToWriteFile();
    }

    parent::clear();
}