public function Run() { $file = "iproclass.tb.gz"; $ldir = $this->GetParameterValue('indir'); $odir = $this->GetParameterValue('outdir'); $rdir = $this->GetParameterValue('download_url'); $lfile = $ldir . $file; if (!file_exists($lfile)) { trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE); parent::setParameterValue('download', true); } //download all files $rfile = $rdir . $file; if ($this->GetParameterValue('download') == true) { echo "downloading {$file}... "; utils::DownloadSingle($rfile, $lfile); // $cmd = "gzip -c $lfile | split -d -l 1000000 --filter='gzip > $FILE.gz' - iproclass-" } $ofile = "iproclass.nq"; $gz = true; parent::setReadFile($lfile, true); echo "processing {$file}... "; $this->process(); echo "done!" . PHP_EOL; parent::getWriteFile()->close(); echo "generating dataset release file... "; $source_file = (new DataResource($this))->setURI($rfile)->setTitle("iProClass")->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z", filemtime($lfile)))->setFormat("text/tab-separated-value")->setFormat("application/gzip")->setPublisher("http://pir.georgetown.edu")->setHomepage("http://pir.georgetown.edu/iproclass")->setRights("use-share-modify")->setLicense("http://pir.georgetown.edu/pirwww/about/linkpir.shtml")->setDataset("http://identifiers.org/iproclass/"); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/iproclass/iproclass.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("by-attribution")->setRights("restricted-by-source-license")->setLicense("http://creativecommons.org/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); if ($gz) { $output_file->setFormat("application/gzip"); } if (strstr(parent::getParameterValue('output_format'), "nt")) { $output_file->setFormat("application/n-triples"); } else { $output_file->setFormat("application/n-quads"); } $dataset_description = $source_file->toRDF() . $output_file->toRDF(); parent::setWriteFile($odir . parent::getBio2RDFReleaseFile()); parent::getWriteFile()->write($dataset_description); parent::getWriteFile()->close(); echo "done!" . PHP_EOL; }
function Run() { $dataset_description = ''; $ldir = parent::getParameterValue('indir'); $odir = parent::getParameterValue('outdir'); $list_file = $ldir . "ftp_list.txt"; if (!file_exists($list_file) || $this->getParameterValue('download') == true) { echo "Getting FTP file list ..."; $list = $this->getFtpFileList('ftp.ncbi.nlm.nih.gov', '/refseq/release/complete/', '/(complete\\.[0-9]+\\.protein\\.gpff\\.gz)/'); if (!isset($list) or count($list) == 0) { trigger_error("Unable to get list of files from FTP site. Check internet connection", E_USER_ERROR); exit(-1); } asort($list); $buf = implode("\n", $list); file_put_contents($list_file, $buf); echo "Done." . PHP_EOL; } else { echo "Using existing ftp list" . PHP_EOL; $list = explode("\n", file_get_contents($list_file)); } $counter = 1; $total = count($list); foreach ($list as $f) { $lfile = $ldir . $f; echo "Processing " . $counter++ . "/{$total} {$f}. "; if (!file_exists($lfile) || $this->getParameterValue('download') == true) { $rfile = parent::getParameterValue('download_url') . $f; echo "Downloading ..."; utils::DownloadSingle($rfile, $lfile); echo "done."; } else { echo "Using existing file."; } echo PHP_EOL; } //if download //iterate over the files $files = $this->getFilePaths($ldir, 'gz'); asort($files); foreach ($files as $f) { $lfile = $ldir . $f; $ofile = $odir . basename($f, ".gz") . "." . parent::getParameterValue('output_format'); $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false; parent::setWriteFile($ofile, $gz); parent::setReadFile($lfile, true); echo "processing {$f} ..."; $this->process(); parent::clear(); echo "done!" . PHP_EOL; $this->getReadFile()->close(); $this->getWriteFile()->close(); $source_file = (new DataResource($this))->setURI(parent::getParameterValue('download_url') . $lfile)->setTitle("NCBI RefSeq - {$f}")->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z", filemtime($lfile)))->setFormat('text/refseq-format')->setFormat('application/zip')->setPublisher('http://www.ncbi.nlm.nih.gov')->setHomepage('http://www.ncbi.nlm.nih.gov/refseq')->setRights('use')->setRights('attribution')->setLicense('http://www.nlm.nih.gov/copyright.html')->setDataset(parent::getDatasetURI()); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$f}")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/refseq/refseq.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("restricted-by-source-license")->setLicense("http://creativecommons/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); $dataset_description .= $output_file->toRDF() . $source_file->toRDF(); } //for parent::writeToReleaseFile($dataset_description); parent::getWriteFile()->close(); }
function process() { $ldir = parent::getParameterValue('indir'); $odir = parent::getParameterValue('outdir'); $rdir = parent::getParameterValue('download_url'); //which files are to be converted? $files = trim($this->GetParameterValue('files')); if ($files == 'all') { $files = $this->getPackageMap(); } else { $sel_arr = explode(",", $files); $pm = $this->getPackageMap(); $files = array(); foreach ($sel_arr as $a) { if (array_key_exists($a, $pm)) { $files[$a] = $pm[$a]; } } } if ($this->getParameterValue('limit_organisms') == true) { $this->taxids = array_flip(explode(",", $this->getParameterValue('organisms'))); } //set dataset graph to be dataset URI $graph_uri = parent::getGraphURI(); if (parent::getParameterValue('dataset_graph') == true) { parent::setGraphURI(parent::getDatasetURI()); } $dataset_description = ''; //now iterate over the files array foreach ($files as $module => $rfilename) { $file = $module . ".gz"; $lfile = $ldir . $file; $rfile = $rdir . $rfilename; // download if (!file_exists($lfile) || parent::getParameterValue('download') == true) { trigger_error("{$lfile} not found. Will attempt to download.", E_USER_NOTICE); $myfile = $lfile; if ($module == "gene2sts" || $module == "gene2unigene") { $myfile = "compress.zlib://" . $lfile; } echo "downloading {$module} ..."; utils::DownloadSingle($rfile, $myfile); echo "done" . PHP_EOL; } } foreach ($files as $module => $rfilename) { $file = $module . ".gz"; $lfile = $ldir . $file; $rfile = $rdir . $rfilename; $ofile = $module . "." . parent::getParameterValue('output_format'); $gz = false; if (strstr(parent::getParameterValue('output_format'), "gz")) { $gz = true; } echo "Processing {$module} ... "; parent::setReadFile($lfile, true); parent::setWriteFile($odir . $ofile, $gz); $fnx = $module; if ($module == 'gene2refseq') { $fnx = 'gene2accession'; } $this->{$fnx}(); parent::clear(); echo 'done!' . PHP_EOL; parent::getReadFile()->close(); parent::getWriteFile()->close(); // generate the dataset release file // dataset description $source_file = (new DataResource($this))->setURI($rfile)->setTitle("NCBI Gene ({$module})")->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z", filemtime($lfile)))->setFormat("text/tab-separated-value")->setFormat("application/gzip")->setPublisher("http://www.ncbi.nlm.nih.gov")->setHomepage("http://www.ncbi.nlm.nih.gov/gene")->setRights("use-share-modify")->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")->setDataset("http://identifiers.org/ncbigene/"); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/ncbigene/ncbigene.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("by-attribution")->setRights("restricted-by-source-license")->setLicense("http://creativecommons.org/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); if ($gz) { $output_file->setFormat("application/gzip"); } if (strstr(parent::getParameterValue('output_format'), "nt")) { $output_file->setFormat("application/n-triples"); } else { $output_file->setFormat("application/n-quads"); } $dataset_description .= $source_file->toRDF() . $output_file->toRDF(); } //foreach //set graph URI back to default value parent::setGraphURI($graph_uri); //write dataset description to file echo "Generating dataset description... "; parent::setWriteFile($odir . parent::getBio2RDFReleaseFile()); parent::getWriteFile()->write($dataset_description); parent::getWriteFile()->close(); echo "done!" . PHP_EOL; }
function run() { // get the file list if ($this->GetParameterValue('files') == 'all') { $files = explode("|", $this->GetParameterList('files')); array_shift($files); } else { $files = explode(",", $this->GetParameterValue('files')); } if ($this->getParameterValue('additional') != 'none') { $f = explode(",", $this->getParameterValue('additional')); $files = array_merge($files, $f); } $ldir = $this->GetParameterValue('indir'); $odir = $this->GetParameterValue('outdir'); $rdir = $this->GetParameterValue('download_url'); $dataset_description = ''; foreach ($files as $file) { $suffix = ".zip"; $lfile = $ldir . $file . $suffix; $rfile = $rdir . $file . $suffix; if ($file == "offsides" and !file_exists($lfile)) { echo "downloading twosides..."; $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip"; utils::DownloadSingle($rfile, $lfile); echo "done" . PHP_EOL; } elseif ($file == "twosides" and !file_exists($lfile)) { echo "downloading {$file} ..."; $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip"; utils::DownloadSingle($rfile, $lfile); echo "done" . PHP_EOL; } elseif ($file == 'annotations' or $file == 'relationships') { if (!file_exists($lfile)) { echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip" . PHP_EOL; continue; } } else { if (!file_exists($lfile) or parent::getParameterValue('download') == true) { echo "Downloading {$lfile} ... "; Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId=' . $file . '.zip&dlCls=common', $lfile); echo "done" . PHP_EOL; } } // get a pointer to the file in the zip archive if (!file_exists($lfile)) { echo "no local copy of {$lfile} . skipping" . PHP_EOL; continue; } $zin = new ZipArchive(); if ($zin->open($lfile) === FALSE) { trigger_error("Unable to open {$lfile}"); exit; } $zipentries = array(); if ($file == "annotations") { // exclude: 'clinical_ann.tsv','study_parameters.tsv' $zipentries = array('clinical_ann_metadata.tsv', 'var_drug_ann.tsv', 'var_pheno_ann.tsv', 'var_fa_ann.tsv'); } else { if ($file == "pathways") { for ($i = 0; $i < $zin->numFiles; $i++) { $stat = $zin->statIndex($i); $entry = $stat['name']; $ext = pathinfo($entry, PATHINFO_EXTENSION); if ($ext != "txt") { $zipentries[] = $entry; } } } else { if ($file == "relationships") { $zipentries = array("relationships.tsv"); } else { if ($file == 'offsides') { $zipentries = array('3003377s-offsides.tsv'); } else { if ($file == 'twosides') { $zipentries = array('3003377s-twosides.tsv'); } else { $zipentries = array($file . ".tsv"); } } } } } // set the write file, parse, write and close $suffix = parent::getParameterValue('output_format'); $outfile = $file . '.' . $suffix; $gz = false; if (strstr(parent::getParameterValue('output_format'), "gz")) { $gz = true; } $this->SetWriteFile($odir . $outfile, $gz); foreach ($zipentries as $zipentry) { if (($fp = $zin->getStream($zipentry)) === FALSE) { trigger_error("Unable to get {$file}.tsv in ziparchive {$lfile}"); return FALSE; } $this->SetReadFile($lfile); $this->GetReadFile()->SetFilePointer($fp); if ($file == "annotations") { $fnx = substr($zipentry, 0, strpos($zipentry, ".tsv")); echo "processing {$zipentry}.."; } else { if ($file == 'pathways') { $fnx = 'pathways'; echo "processing {$fnx} ({$zipentry})... "; } else { $fnx = $file; echo "processing {$fnx} ... "; } } $this->{$fnx}(); parent::writeRDFBufferToWriteFile(); parent::clear(); echo "done!" . PHP_EOL; // generate the dataset release file $source_file = (new DataResource($this))->setURI($rfile)->setTitle("Pharmacogenomics Knowledge Base ({$zipentry})")->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z", filemtime($lfile)))->setFormat("text/tab-separated-value")->setFormat("application/zip")->setPublisher("http://www.pharmgkb.org/")->setHomepage("http://www.pharmgkb.org/")->setRights("use")->setRights("no-commercial")->setLicense("http://www.pharmgkb.org/page/policies")->setDataset("http://identifiers.org/pharmgkb/"); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} {$file} (generated at {$date})")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pharmgkb/pharmgkb.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("by-attribution")->setRights("restricted-by-source-license")->setLicense("http://creativecommons.org/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); if ($gz) { $output_file->setFormat("application/gzip"); } if (strstr(parent::getParameterValue('output_format'), "nt")) { $output_file->setFormat("application/n-triples"); } else { $output_file->setFormat("application/n-quads"); } $dataset_description .= $source_file->toRDF() . $output_file->toRDF(); } $this->GetWriteFile()->Close(); } // foreach echo "Generating dataset description... "; parent::setWriteFile($odir . parent::getBio2RDFReleaseFile()); parent::getWriteFile()->write($dataset_description); parent::getWriteFile()->close(); echo "done!" . PHP_EOL; }
function run() { if (parent::getParameterValue('files') == 'all') { $files = explode("|", $this->GetParameterList('files')); array_shift($files); } else { $files = explode(",", parent::getParameterValue('files')); } $ldir = parent::getParameterValue('indir'); $odir = parent::getParameterValue('outdir'); $rdir = parent::getParameterValue('download_url'); $dataset_description = ''; foreach ($files as $file) { $download = parent::getParameterValue('download'); $lfile = $ldir . "goa_" . $file . ".gz"; if (!file_exists($lfile) && $download == false) { trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE); $download = true; } //download file $rfile = $rdir . strtoupper($file) . "/gene_association.goa_" . $file . ".gz"; if ($download == true) { echo "downloading {$file} ... "; //file_put_contents($lfile,file_get_contents($rfile)); utils::DownloadSingle($rfile, $lfile); } $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true; $ofile = "goa_" . $file . "." . parent::getParameterValue('output_format'); parent::setReadFile($lfile, TRUE); parent::setWriteFile($odir . $ofile, $gz); echo "processing {$file} ... "; $this->process($file); echo "done!"; parent::clear(); //close write file parent::getWriteFile()->close(); echo PHP_EOL; // dataset description $graph_uri = parent::getGraphURI(); if (parent::getParameterValue('dataset_graph') == true) { parent::setGraphURI(parent::getDatasetURI()); } $source_file = (new DataResource($this))->setURI($rfile)->setTitle("Gene Ontology Annotation file {$file} ({$rfile}")->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z", filemtime($lfile)))->setFormat("text/tab-separated-value")->setFormat("application/gzip")->setPublisher("http://www.ebi.ac.uk/")->setHomepage("http://www.ebi.ac.uk/GOA/")->setRights("use")->setLicense("http://www.ebi.ac.uk/GOA/goaHelp.html")->setDataset("http://identifiers.org/goa/"); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/irefindex/irefindex.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("by-attribution")->setRights("restricted-by-source-license")->setLicense("http://creativecommons.org/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); if ($gz) { $output_file->setFormat("application/gzip"); } if (strstr(parent::getParameterValue('output_format'), "nt")) { $output_file->setFormat("application/n-triples"); } else { $output_file->setFormat("application/n-quads"); } $dataset_description .= $source_file->toRDF() . $output_file->toRDF(); parent::setGraphURI($graph_uri); } parent::setWriteFile($odir . parent::getBio2RDFReleaseFile()); parent::getWriteFile()->write($dataset_description); parent::getWriteFile()->close(); echo "done!" . PHP_EOL; }
function run() { $ldir = parent::getParameterValue('indir'); $odir = parent::getParameterValue('outdir'); // get the snps from pharmgkb $snps = explode(",", parent::getParameterValue('files')); if ($snps[0] == 'all') { $snps = $this->getSNPs(); } else { if ($snps[0] == 'clinical') { $snps = $this->getSNPs(true); } else { if ($snps[0] == 'omim') { $lfile = $ldir . 'snp_omimvar.txt'; if (!file_exists($lfile) || parent::getParameterValue('download') == true) { $ret = utils::DownloadSingle('ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/snp_omimvar.txt', $lfile); } $snps = $this->processOMIMVar($lfile); } else { if ($snps[0] == 'pharmgkb') { $lfile = $ldir . 'pharmgkb.snp.zip'; if (!file_exists($lfile) || parent::getParameterValue('download') == true) { $ret = utils::DownloadSingle('http://www.pharmgkb.org/download.do?objId=rsid.zip&dlCls=common', $lfile); } $snps = $this->processPharmGKBSnps($lfile); } } } } $outfile = $odir . "dbsnp." . parent::getParameterValue('output_format'); $gz = strstr(parent::getParameterValue('output_format'), ".gz") === FALSE ? false : true; parent::setWriteFile($outfile, $gz); $n = count($snps); $z = 0; foreach ($snps as $i => $snp) { $file = $snp . '.xml.gz'; $infile = $ldir . $file; $rfile = parent::getParameterValue('download_url') . $snp; //$outfile = $odir.$snp.".".parent::getParameterValue('output_format'); // check if exists $download = false; if (!file_exists($infile)) { //trigger_error($lfile." not found. Will attempt to download. ", E_USER_NOTICE); parent::setParameterValue('download', true); } // download if (parent::getParameterValue('download') == true) { trigger_error("Downloading {$file}", E_USER_NOTICE); $ret = utils::downloadSingle($rfile, "compress.zlib://" . $infile, true); if ($ret === false) { continue; } } // process echo "Processing {$snp} (" . ($i + 1) . "/{$n})" . PHP_EOL; $this->parse($infile); parent::writeRDFBufferToWriteFile(); if ($z++ % 10000 == 0) { parent::clear(); } } parent::getWriteFile()->close(); // generate the dataset description file $source_file = (new DataResource($this))->setURI($rfile)->setTitle("dbSNP " . parent::getDatasetVersion())->setRetrievedDate(date("Y-m-d\\TG:i:s\\Z"))->setFormat("application/xml")->setPublisher("http://www.ncbi.nlm.nih.gov")->setHomepage("http://www.ncbi.nlm.nih.gov/SNP/")->setRights("use-share-modify")->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")->setDataset("http://identifiers.org/dbsnp/"); $prefix = parent::getPrefix(); $bVersion = parent::getParameterValue('bio2rdf_release'); $date = date("Y-m-d\\TG:i:s\\Z"); $output_file = (new DataResource($this))->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")->setSource($source_file->getURI())->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dbsnp/dbsnp.php")->setCreateDate($date)->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")->setPublisher("http://bio2rdf.org")->setRights("use-share-modify")->setRights("by-attribution")->setRights("restricted-by-source-license")->setLicense("http://creativecommons.org/licenses/by/3.0/")->setDataset(parent::getDatasetURI()); if ($gz) { $output_file->setFormat("application/gzip"); } if (strstr(parent::getParameterValue('output_format'), "nt")) { $output_file->setFormat("application/n-triples"); } else { $output_file->setFormat("application/n-quads"); } $dataset_description = $source_file->toRDF() . $output_file->toRDF(); parent::setWriteFile($odir . parent::getBio2RDFReleaseFile()); parent::getWriteFile()->write($dataset_description); parent::getWriteFile()->close(); }