/**
 * Converts the HGNC complete set to RDF and writes the dataset release file.
 *
 * The input file hgnc_complete_set.txt.gz is looked up in 'indir'. When it is
 * missing, or when the 'download' parameter is set, it is fetched from the
 * 'download_url' parameter (passed unchanged as the remote location).
 * process() then converts it into 'outdir'/hgnc.<output_format>, after which a
 * description of the source and output files is written to the release file.
 */
function Run()
{
    $file = "hgnc_complete_set.txt.gz";
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $rdir = parent::getParameterValue('download_url');
    $lfile = $ldir . $file;

    // a missing input file forces a download
    if (!file_exists($lfile) && parent::getParameterValue('download') == false) {
        trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
        parent::setParameterValue('download', true);
    }

    // download the hgnc file
    if (parent::getParameterValue('download') == true) {
        echo "downloading {$file} ... ";
        Utils::DownloadSingle($rdir, $lfile);
    }

    $output_format = parent::getParameterValue('output_format');
    $ofile = $odir . "hgnc." . $output_format;
    // any output format mentioning "gz" is written compressed
    $gz = strstr($output_format, "gz") !== false;

    parent::setWriteFile($ofile, $gz);
    parent::setReadFile($lfile, true);

    echo "processing {$file}... ";
    $this->process();
    echo "done!" . PHP_EOL;

    // close the write file (once; nothing else is written to it below)
    parent::getWriteFile()->close();
    echo PHP_EOL;

    // generate the dataset release file
    echo "generating dataset release file... ";

    // xsd:dateTime compatible timestamps: zero-padded hour, expressed in UTC
    // to agree with the trailing "Z"
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $source_file = (new DataResource($this))
        ->setURI($rdir)
        ->setTitle('HUGO Gene Nomenclature Committee (HGNC)')
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
        ->setFormat('text/tab-separated-value')
        ->setFormat('application/gzip') // the source is a .gz file, not a zip archive
        ->setPublisher('http://www.genenames.org/')
        ->setHomepage('http://www.genenames.org/data/gdlw_columndef.html')
        ->setRights('use')
        ->setRights('attribution')
        ->setLicense('http://www.genenames.org/about/overview')
        ->setDataset(parent::getDatasetURI());

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/hgnc/hgnc.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    echo "done!" . PHP_EOL;

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr($output_format, "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    // write the dataset description
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}
/**
 * Converts the NCBI HomoloGene data file to RDF and writes the dataset release file.
 *
 * homologene.data is looked up in 'indir' and downloaded from
 * 'download_url' . 'homologene.data' when it is missing or when the 'download'
 * parameter is set. process() converts it into
 * 'outdir'/homologene.<output_format>, after which a description of the source
 * and output files is written to the release file.
 */
function Run()
{
    $file = "homologene.data";
    $ldir = $this->GetParameterValue('indir');
    $odir = $this->GetParameterValue('outdir');
    $rdir = $this->GetParameterValue('download_url');
    $lfile = $ldir . $file;

    // a missing input file forces a download
    if (!file_exists($lfile)) {
        trigger_error($file . " not found. Will attempt to download.", E_USER_NOTICE);
        parent::setParameterValue('download', true);
    }

    // download
    $rfile = $rdir . $file;
    if ($this->GetParameterValue('download') == true) {
        echo "downloading {$file} ... ";
        utils::downloadSingle($rfile, $lfile);
    }

    $output_format = parent::getParameterValue('output_format');
    $ofile = 'homologene.' . $output_format;
    // any output format mentioning "gz" is written compressed
    $gz = strstr($output_format, "gz") ? true : false;

    parent::setReadFile($lfile);
    parent::setWriteFile($odir . $ofile, $gz);

    echo "processing {$file}... ";
    $this->process();
    echo "done!" . PHP_EOL;
    parent::getWriteFile()->close();

    // generate the dataset release file

    // xsd:dateTime compatible timestamps: zero-padded hour, expressed in UTC
    // to agree with the trailing "Z"
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("NCBI Homologene")
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
        ->setFormat("text/tab-separated-value")
        ->setPublisher("http://www.ncbi.nlm.nih.gov")
        ->setHomepage("http://www.ncbi.nlm.nih.gov/homologene")
        ->setRights("use-share-modify")
        ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
        ->setDataset("http://identifiers.org/homologene/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
        // single-line title: no raw line break inside the literal
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/homologene/homologene.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr($output_format, "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    echo "Generating dataset description... ";
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Converts the iProClass table to RDF and writes the dataset release file.
 *
 * iproclass.tb.gz is looked up in 'indir' and downloaded from
 * 'download_url' . 'iproclass.tb.gz' when it is missing or when the 'download'
 * parameter is set. The file is opened as the (gzip) read file, process() is
 * run, and a description of the source and output files is written to the
 * release file in 'outdir'.
 */
public function Run()
{
    $file = "iproclass.tb.gz";
    $ldir = $this->GetParameterValue('indir');
    $odir = $this->GetParameterValue('outdir');
    $rdir = $this->GetParameterValue('download_url');
    $lfile = $ldir . $file;

    // a missing input file forces a download
    if (!file_exists($lfile)) {
        trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
        parent::setParameterValue('download', true);
    }

    // download all files
    $rfile = $rdir . $file;
    if ($this->GetParameterValue('download') == true) {
        echo "downloading {$file}... ";
        utils::DownloadSingle($rfile, $lfile);
        // $cmd = "gzip -c $lfile | split -d -l 1000000 --filter='gzip > $FILE.gz' - iproclass-"
    }

    // the described output name and compression are fixed here rather than
    // derived from the 'output_format' parameter
    $ofile = "iproclass.nq";
    $gz = true;

    // NOTE(review): no write file is set in this method before process() and
    // getWriteFile()->close() are called; presumably process() opens its own
    // write file(s) - confirm.
    parent::setReadFile($lfile, true);

    echo "processing {$file}... ";
    $this->process();
    echo "done!" . PHP_EOL;
    parent::getWriteFile()->close();

    echo "generating dataset release file... ";

    // xsd:dateTime compatible timestamps: zero-padded hour, expressed in UTC
    // to agree with the trailing "Z"
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("iProClass")
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
        ->setFormat("text/tab-separated-value")
        ->setFormat("application/gzip")
        ->setPublisher("http://pir.georgetown.edu")
        ->setHomepage("http://pir.georgetown.edu/iproclass")
        ->setRights("use-share-modify")
        ->setLicense("http://pir.georgetown.edu/pirwww/about/linkpir.shtml")
        ->setDataset("http://identifiers.org/iproclass/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
        // single-line title: no raw line break inside the literal
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/iproclass/iproclass.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    // write the dataset description
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Downloads (as needed) and converts OMIM entries to RDF, then writes the
 * dataset release file.
 *
 * The API key is taken from the 'omim_api_key' parameter or, when that is
 * empty, from the file named by 'omim_api_key_file'. The 'files' parameter
 * selects the entries to process: 'all', a hyphenated range ("START-END"), or
 * a comma-separated list of ids. Each entry is cached in 'indir' as
 * <id>.json.gz and is fetched from the API when the cached file is missing or
 * the 'download' parameter is set. Entries that cannot be retrieved or decoded
 * are skipped with a warning.
 *
 * @return bool always true
 */
function Run()
{
    // directory shortcuts
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // fall back to the key file when no key was given as a parameter
    if (parent::getParameterValue('omim_api_key') == '') {
        $key_file = parent::getParameterValue('omim_api_key_file');
        if (file_exists($key_file)) {
            $key = trim(file_get_contents($key_file));
            if ($key) {
                parent::setParameterValue('omim_api_key', $key);
            } else {
                trigger_error("No API key found in the specified omim key file {$key_file}", E_USER_WARNING);
            }
        } else {
            trigger_error("No OMIM key has been provided either by commmand line or in the expected omim key file {$key_file}", E_USER_WARNING);
        }
    }

    // get the list of mim2gene entries
    $entries = $this->GetListOfEntries($ldir);

    // get the work specified
    $list = trim(parent::getParameterValue('files'));
    if ($list != 'all') {
        // start empty so a selection that matches nothing yields no work
        // instead of an undefined variable
        $myentries = array();

        if (($pos = strpos($list, "-")) !== false) {
            // hyphenated range: keep every known entry within [start, end]
            $start_range = substr($list, 0, $pos);
            $end_range = substr($list, $pos + 1);
            foreach ($entries as $e => $type) {
                if ($e >= $start_range && $e <= $end_range) {
                    $myentries[$e] = $type;
                }
            }
            $entries = $myentries;
        } else {
            // comma separated list: keep the known entries that were requested
            foreach (explode(",", $list) as $e) {
                $myentries[trim($e)] = '';
            }
            $entries = array_intersect_key($entries, $myentries);
        }
    }

    // set the write file
    $output_format = parent::getParameterValue('output_format');
    $gz = strstr($output_format, ".gz") !== false;
    $outfile = 'omim.' . $output_format;
    parent::setWriteFile($odir . $outfile, $gz);

    // declare the mapping method types
    $this->get_method_type(null, true);

    // iterate over the entries
    $i = 0;
    $total = count($entries);
    foreach ($entries as $omim_id => $type) {
        echo "processing " . ++$i . " of {$total} - omim# ";
        $download_file = $ldir . $omim_id . ".json.gz";
        // read through the zlib wrapper; it also accepts uncompressed content
        $gzfile = "compress.zlib://{$download_file}";

        // download if the file doesn't exist or we are told to
        if (!file_exists($download_file) || parent::getParameterValue('download') == true) {
            // download using the api
            $url = parent::getParameterValue('omim_api_url')
                . '&apiKey=' . parent::getParameterValue('omim_api_key')
                . '&mimNumber=' . $omim_id;
            $buf = file_get_contents($url);
            if ($buf !== false && strlen($buf) != 0) {
                file_put_contents($download_file, $buf);
            }
            // throttle after every request, successful or not
            usleep(500000); // limit of 4 requests per second
        }

        // nothing cached and nothing downloaded: skip this entry
        if (!file_exists($download_file)) {
            trigger_error("Unable to retrieve OMIM entry {$omim_id}; skipping", E_USER_WARNING);
            echo PHP_EOL;
            continue;
        }

        // load entry, parse and write to file
        $entry = json_decode(file_get_contents($gzfile), true);
        if (!isset($entry["omim"]["entryList"][0]["entry"]['mimNumber'])) {
            trigger_error("Unable to read OMIM entry from {$download_file}; skipping", E_USER_WARNING);
            echo PHP_EOL;
            continue;
        }

        $omim_id = trim((string) $entry["omim"]["entryList"][0]["entry"]['mimNumber']);
        echo $omim_id;
        $this->ParseEntry($entry, $type);
        parent::writeRDFBufferToWriteFile();
        echo PHP_EOL;
    }
    parent::writeRDFBufferToWriteFile();
    parent::getWriteFile()->close();

    // generate the dataset description file

    // xsd:dateTime compatible timestamps: zero-padded hour, expressed in UTC
    // to agree with the trailing "Z"
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $source_file = (new DataResource($this))
        ->setURI(parent::getParameterValue('omim_api_url'))
        ->setTitle("OMIM " . parent::getDatasetVersion())
        ->setRetrievedDate(gmdate($timestamp_format))
        ->setFormat("application/json")
        ->setPublisher("http://omim.org")
        ->setHomepage("http://omim.org")
        ->setRights("use")
        ->setRights("no-commercial")
        ->setRights("registration-required")
        ->setLicense("http://www.omim.org/help/agreement")
        ->setDataset("http://identifiers.org/omim/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/omim/omim.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr($output_format, "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    // write the dataset description
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();

    return true;
}
/**
 * Downloads (as needed) and converts dbSNP records to RDF, then writes the
 * dataset release file.
 *
 * The first element of the comma-separated 'files' parameter selects the SNP
 * list: 'all', 'clinical', 'omim' or 'pharmgkb' use the matching helper;
 * anything else is taken as the literal list of SNP identifiers. Each SNP is
 * read from 'indir'/<snp>.xml.gz, which is downloaded from
 * 'download_url' . <snp> when the file is missing or the 'download' parameter
 * is set. SNPs whose download fails are skipped.
 */
function run()
{
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // read once: a missing file must not turn on downloading for every
    // subsequent SNP
    $force_download = parent::getParameterValue('download') == true;

    // get the snps from pharmgkb
    $snps = explode(",", parent::getParameterValue('files'));
    switch ($snps[0]) {
        case 'all':
            $snps = $this->getSNPs();
            break;
        case 'clinical':
            $snps = $this->getSNPs(true);
            break;
        case 'omim':
            $lfile = $ldir . 'snp_omimvar.txt';
            if (!file_exists($lfile) || $force_download) {
                utils::DownloadSingle('ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/snp_omimvar.txt', $lfile);
            }
            $snps = $this->processOMIMVar($lfile);
            break;
        case 'pharmgkb':
            $lfile = $ldir . 'pharmgkb.snp.zip';
            if (!file_exists($lfile) || $force_download) {
                utils::DownloadSingle('http://www.pharmgkb.org/download.do?objId=rsid.zip&dlCls=common', $lfile);
            }
            $snps = $this->processPharmGKBSnps($lfile);
            break;
        default:
            // explicit list of SNP identifiers: use as given
            break;
    }

    $output_format = parent::getParameterValue('output_format');
    // keep the bare file name separate from the local path: only the name
    // belongs in the public release URI below
    $outname = "dbsnp." . $output_format;
    $outfile = $odir . $outname;
    $gz = strstr($output_format, ".gz") !== false;
    parent::setWriteFile($outfile, $gz);

    $download_url = parent::getParameterValue('download_url');
    // defined up front so the source description below still has a URI when
    // the SNP list is empty; otherwise it ends up as the last SNP's URL
    $rfile = $download_url;

    $n = count($snps);
    $z = 0;
    foreach ($snps as $i => $snp) {
        $file = $snp . '.xml.gz';
        $infile = $ldir . $file;
        $rfile = $download_url . $snp;

        // download when asked to, or when this particular file is missing
        $download = $force_download || !file_exists($infile);
        if ($download) {
            trigger_error("Downloading {$file}", E_USER_NOTICE);
            $ret = utils::downloadSingle($rfile, "compress.zlib://" . $infile, true);
            if ($ret === false) {
                continue;
            }
        }

        // process
        echo "Processing {$snp} (" . ($i + 1) . "/{$n})" . PHP_EOL;
        $this->parse($infile);
        parent::writeRDFBufferToWriteFile();

        // periodic clear(), including on the first SNP
        if ($z++ % 10000 == 0) {
            parent::clear();
        }
    }
    parent::getWriteFile()->close();

    // generate the dataset description file

    // xsd:dateTime compatible timestamps: zero-padded hour, expressed in UTC
    // to agree with the trailing "Z"
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    // NOTE(review): the source URI is the URL of the last SNP in the list;
    // confirm whether the base 'download_url' was intended instead.
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("dbSNP " . parent::getDatasetVersion())
        ->setRetrievedDate(gmdate($timestamp_format))
        ->setFormat("application/xml")
        ->setPublisher("http://www.ncbi.nlm.nih.gov")
        ->setHomepage("http://www.ncbi.nlm.nih.gov/SNP/")
        ->setRights("use-share-modify")
        ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
        ->setDataset("http://identifiers.org/dbsnp/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outname}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dbsnp/dbsnp.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr($output_format, "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    // write the dataset description
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
}