/**
 * Convert the InterPro XML dump into RDF and write the dataset description.
 *
 * Steps, in order:
 *  1. fetch interpro.xml.gz into `indir` when it is missing or the
 *     `download` parameter equals "true";
 *  2. load it through CXML and hand the XML root to $this->parse();
 *  3. flush and close the RDF output file in `outdir`;
 *  4. write the source/output DataResource descriptions to the release file.
 *
 * The script terminates through exit when the download or the local write
 * of the downloaded file fails.
 *
 * @return bool true once the dataset description has been written
 */
function Run() {
    // directory shortcuts
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // remote and local location of the InterPro dump
    $rfile = trim(parent::getParameterValue('download_url'));
    $file = "interpro.xml.gz";
    $lfile = $ldir . $file;

    if (!file_exists($lfile) || parent::getParameterValue("download") == "true") {
        echo "Downloading {$lfile}" . PHP_EOL;
        $ret = file_get_contents($rfile);
        if ($ret === false) {
            trigger_error("unable to download {$rfile}");
            exit;
        }
        // a failed write would otherwise only surface later as a parse error
        if (file_put_contents($lfile, $ret) === false) {
            trigger_error("unable to write {$lfile}");
            exit;
        }
    }

    echo "Loading XML file...";
    $cxml = new CXML($lfile);
    $cxml->Parse();
    $xml = $cxml->GetXMLRoot();
    echo "Done" . PHP_EOL;

    // set the write file; output is gzipped when the format contains ".gz"
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
    $outfile = "interpro." . parent::getParameterValue('output_format');
    parent::setWriteFile($odir . $outfile, $gz);

    echo "Parsing interpro xml file" . PHP_EOL;
    $this->parse($xml);
    parent::writeRDFBufferToWriteFile();
    parent::getWriteFile()->close();
    echo "Done!" . PHP_EOL;

    // the dataset description goes into the dataset graph
    parent::setGraphURI(parent::getDatasetURI());

    // Timestamps: "H" gives a zero-padded hour ("G" does not), and gmdate()
    // makes the literal "Z" suffix truthful because the value is really UTC.
    $timestamp_format = 'Y-m-d\TH:i:s\Z';

    // description of the downloaded source file
    $source_version = parent::getDatasetVersion();
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("InterPro v{$source_version}")
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($lfile)))
        ->setFormat("application/xml")
        ->setFormat("application/g-zip")
        ->setPublisher("http://www.ebi.ac.uk/")
        ->setHomepage("http://www.ebi.ac.uk/interpro/")
        // the line break inside this literal is part of the original string
        ->setRights("InterPro - Integrated Resource Of Protein Domains And Functional Sites. \nCopyright (C) 2001 The InterPro Consortium")
        ->setLicense("http://www.ebi.ac.uk/interpro/faqs.html")
        ->setDataset("http://identifiers.org/interpro/");

    // description of the generated RDF file
    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/interpro.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    // write both descriptions to the release file
    $dataset_description = $source_file->toRDF() . $output_file->toRDF();
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();

    return true;
}
/**
 * Parse the directory of downloaded ClinicalTrials.gov records.
 *
 * Every file in `indir` matching "NCT*" is passed to $this->process_file()
 * unless it is empty or excluded by the comma separated `id_list` parameter
 * (an empty `id_list` selects everything). Afterwards the source and output
 * DataResource descriptions are written to the release file.
 *
 * @return false|null false when the release file cannot be opened; nothing otherwise
 */
function parse_dir() {
    $this->setCheckPoint('dataset');

    // "H" gives a zero-padded hour ("G" does not), and gmdate() makes the
    // literal "Z" suffix truthful because the value is really UTC.
    $timestamp_format = 'Y-m-d\TH:i:s\Z';

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    // NOTE(review): this handle is only opened (truncating the file) and
    // closed again at the end; the description itself is written through
    // parent::writeToReleaseFile(). Confirm both refer to the same file.
    $dataset_file = parent::getParameterValue("outdir") . parent::getBio2RDFReleaseFile();
    $fp = fopen($dataset_file, "w");
    if ($fp === false) {
        trigger_error("Unable to open {$dataset_file}", E_USER_ERROR);
        return false;
    }

    // optional restriction to specific trial identifiers
    $id_list = parent::getParameterValue('id_list');
    $process_all = ($id_list == '');
    $ids = explode(",", (string) $id_list);

    $indir = parent::getParameterValue('indir');
    echo "Processing {$indir}\n";

    // set the write file; output is gzipped when the format contains ".gz"
    $outfile = "clinicaltrials." . parent::getParameterValue('output_format');
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
    parent::setWriteFile(parent::getParameterValue("outdir") . $outfile, $gz);

    // glob() returns false on error; treat that like "no files"
    $files = glob($indir . "NCT*") ?: array();
    foreach ($files as $i => $file) {
        // reset the parent's state every 10000 files (including the first)
        if ($i % 10000 == 0) {
            parent::clear();
        }

        $trial_id = basename($file, '.xml');
        if (!$process_all && !in_array($trial_id, $ids, true)) {
            continue;
        }

        if (filesize($file) != 0) {
            echo "Processing {$trial_id}" . PHP_EOL;
            $this->process_file($file);
        } else {
            echo "Processing {$trial_id} -> Empty!" . PHP_EOL;
        }
    }
    echo "Finished." . PHP_EOL;
    parent::getWriteFile()->close();

    // make the dataset description
    parent::setGraphURI(parent::getDatasetURI());
    $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
    $source_version = parent::getDatasetVersion();

    // The retrieval date is the mtime of the last matched file. When nothing
    // matched (or the mtime is unavailable) fall back to the current time
    // instead of reading an undefined loop variable.
    $retrieved = time();
    if (count($files) > 0) {
        $mtime = filemtime(end($files));
        if ($mtime !== false) {
            $retrieved = $mtime;
        }
    }

    // description of the source records
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("Clinicaltrials")
        ->setRetrievedDate(gmdate($timestamp_format, $retrieved))
        ->setFormat("application/xml")
        ->setPublisher("http://clinicaltrials.gov/")
        ->setHomepage("http://clinicaltrials.gov/")
        ->setRights("use")
        ->setRights("by-attribution")
        ->setLicense("http://clinicaltrials.gov/ct2/about-site/terms-conditions")
        ->setDataset("http://identifiers.org/clinicaltrials/");
    parent::writeToReleaseFile($source_file->toRDF());

    // description of the generated RDF file
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }
    parent::writeToReleaseFile($output_file->toRDF());
    parent::closeReleaseFile();

    fclose($fp);
}
/**
 * Convert Affymetrix probeset annotation archives into RDF.
 *
 * The listing page is downloaded when missing or when `download` equals
 * "true", and scanned for "*.csv.zip" links. The `files` parameter selects
 * either all of them ('all') or the ones whose path contains one of the
 * comma separated fragments. Each selected archive that is present in
 * `indir` is opened, its CSV member streamed to $this->parse(), and a source
 * description recorded. Finally the dataset description is written to the
 * release file.
 *
 * The script exits when the listing holds no archives, when nothing was
 * selected, or when an archive cannot be opened.
 *
 * @return bool true on success; false when an archive lacks the expected CSV
 *              member or when no archive could be processed at all
 */
function Run() {
    // directory shortcuts
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // get the listings page
    $url = trim(parent::getParameterValue('download_url'));
    $listing_file = $ldir . "probeset_list.html";
    if (!file_exists($listing_file) || parent::getParameterValue("download") == "true") {
        echo "Downloading {$listing_file}" . PHP_EOL;
        Utils::DownloadSingle($url, $listing_file);
    }
    $listings = file_get_contents($listing_file);

    // make a list of the csv.zip files (captured without the extension)
    preg_match_all("/\"([^\"]+)\\.csv\\.zip\"/", $listings, $m);
    if (count($m[1]) == 0) {
        trigger_error("could not find any .csv.zip files in {$url}");
        exit;
    }
    $available = $m[1];

    // select the files to work on
    $myfiles = array();
    if (parent::getParameterValue("files") == 'all') {
        $myfiles = $available;
    } else {
        foreach (explode(",", parent::getParameterValue("files")) as $f) {
            $found = false;
            foreach ($available as $n) {
                if (strstr($n, $f)) {
                    $found = true;
                    $myfiles[] = $n;
                    break;
                }
            }
            if ($found === false) {
                echo "cannot find {$f} in list" . PHP_EOL;
            }
        }
    }
    if (count($myfiles) == 0) {
        exit; // nothing to do
    }

    $dataset_description = '';
    $source_file = null; // description of the most recently processed archive

    // set the write file; output is gzipped when the format contains ".gz"
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
    $outfile = 'affymetrix.' . parent::getParameterValue('output_format');
    $this->setWriteFile($odir . $outfile, $gz);

    // iterate over the files
    foreach ($myfiles as $rfile) {
        // file name without its path; a link without "/" is used as is
        $slash = strrpos($rfile, "/");
        $base_file = ($slash === false) ? $rfile : substr($rfile, $slash + 1);

        // get and set the dataset version from the first file name that has one
        if (parent::getDatasetVersion() == null) {
            preg_match("/\\.na([0-9]{2})\\.annot/", $base_file, $version_match);
            if (isset($version_match[1])) {
                $this->setDatasetVersion($version_match[1]);
            }
        }
        // switch to the explicitly requested version, if different
        if (parent::getDatasetVersion() != parent::getParameterValue('version')) {
            $base_file = str_replace(
                "na" . parent::getDatasetVersion(),
                "na" . parent::getParameterValue('version'),
                $base_file
            );
        }

        $csv_file = $base_file . ".csv";
        $zip_file = $csv_file . ".zip";
        $lfile = $ldir . $zip_file;
        if (!file_exists($lfile)) {
            echo "skipping: {$lfile} does not exist" . PHP_EOL;
            continue;
        }
        echo "processing {$lfile}" . PHP_EOL;

        // ZipArchive::open() returns true on success and an error code
        // otherwise, so only a strict comparison with true detects failure.
        $zin = new ZipArchive();
        if ($zin->open($lfile) !== true) {
            trigger_error("Unable to open {$lfile}");
            exit;
        }
        if (($fp = $zin->getStream($csv_file)) === false) {
            trigger_error("Unable to get {$csv_file} in ziparchive {$lfile}");
            $zin->close();
            return false;
        }

        parent::setReadFile($lfile);
        parent::getReadFile()->setFilePointer($fp);
        $this->parse($base_file);
        parent::getReadFile()->close();
        $zin->close(); // release the archive handle once its stream is done
        parent::clear();

        // dataset description for this source archive
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Affymetrix Probeset: {$base_file}")
            ->setRetrievedDate(parent::getDate(filemtime($lfile)))
            ->setFormat("text/tab-separated-value")
            ->setFormat("application/zip")
            ->setPublisher("http://affymetrix.com")
            ->setHomepage("http://www.affymetrix.com/support/technical/annotationfilesmain.affx")
            ->setRights("use")
            ->setRights("no-commercial")
            ->setRights("registration-required")
            ->setLicense("http://www.affymetrix.com/about_affymetrix/legal/index.affx")
            ->setDataset("http://identifiers.org/affy.probeset/");
        $dataset_description .= $source_file->toRDF();
    }
    $this->getWriteFile()->close();

    // every selected archive was skipped: there is no source to describe
    if ($source_file === null) {
        trigger_error("none of the selected files were found in {$ldir}", E_USER_WARNING);
        return false;
    }

    // description of the generated RDF file
    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = parent::getDate(filemtime($odir . $outfile));
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/affymetrix/affymetrix.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }
    $dataset_description .= $output_file->toRDF();

    // write the dataset description
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();

    return true;
}
/**
 * Download OMIM entries through the API and convert them to RDF.
 *
 * The API key comes from the `omim_api_key` parameter or, when that is empty,
 * from the file named by `omim_api_key_file`. The `files` parameter selects
 * the entries: 'all', a range "start-end", or a comma separated list. Each
 * entry is cached in `indir` as <mim>.json.gz, fetched when missing or when
 * `download` is enabled, then parsed with $this->ParseEntry(). Entries that
 * cannot be fetched or decoded are skipped with a warning.
 *
 * @return bool TRUE once the dataset description has been written
 */
function Run() {
    // directory shortcuts
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // fall back to the key file when no key was given directly
    if (parent::getParameterValue('omim_api_key') == '') {
        $key_file = parent::getParameterValue('omim_api_key_file');
        if (file_exists($key_file)) {
            $key = trim(file_get_contents($key_file));
            if ($key) {
                parent::setParameterValue('omim_api_key', $key);
            } else {
                trigger_error("No API key found in the specified omim key file {$key_file}", E_USER_WARNING);
            }
        } else {
            trigger_error("No OMIM key has been provided either by commmand line or in the expected omim key file {$key_file}", E_USER_WARNING);
        }
    }

    // get the list of mim2gene entries
    $entries = $this->GetListOfEntries($ldir);

    // get the work specified
    $list = trim(parent::getParameterValue('files'));
    if ($list != 'all') {
        // start empty so a selection that matches nothing yields no entries
        // instead of an undefined variable
        $myentries = array();
        if (($pos = strpos($list, "-")) !== false) {
            // hyphenated range "start-end"
            $start_range = substr($list, 0, $pos);
            $end_range = substr($list, $pos + 1);
            $full_list = $this->GetListOfEntries($ldir);
            foreach ($full_list as $e => $type) {
                if ($e >= $start_range && $e <= $end_range) {
                    $myentries[$e] = $type;
                }
            }
            $entries = $myentries;
        } else {
            // comma separated list; tolerate spaces around the ids
            foreach (explode(",", $list) as $e) {
                $myentries[trim($e)] = '';
            }
            $entries = array_intersect_key($entries, $myentries);
        }
    }

    // set the write file; output is gzipped when the format contains ".gz"
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
    $outfile = 'omim.' . parent::getParameterValue('output_format');
    parent::setWriteFile($odir . $outfile, $gz);

    // declare the mapping method types
    $this->get_method_type(null, true);

    // A loose "== true" is also true for the string "false", which forced a
    // re-download of every entry. FILTER_VALIDATE_BOOLEAN accepts boolean
    // true as well as "true"/"1" and rejects "false".
    // NOTE(review): assumes the parameter is a bool or a CLI string - confirm.
    $force_download = filter_var(parent::getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

    // iterate over the entries
    $i = 0;
    $total = count($entries);
    foreach ($entries as $omim_id => $type) {
        echo "processing " . ++$i . " of {$total} - omim# ";
        $download_file = $ldir . $omim_id . ".json.gz";
        $gzfile = "compress.zlib://{$download_file}";

        // download if the file doesn't exist or we are told to
        if ($force_download || !file_exists($download_file)) {
            $url = parent::getParameterValue('omim_api_url')
                . '&apiKey=' . parent::getParameterValue('omim_api_key')
                . '&mimNumber=' . $omim_id;
            $buf = file_get_contents($url);
            if ($buf !== false && $buf !== '') {
                file_put_contents($download_file, $buf);
                usleep(500000); // throttle requests (API limit noted as 4 requests per second)
            }
        }

        // without a cached copy there is nothing to parse
        if (!file_exists($download_file)) {
            echo PHP_EOL;
            trigger_error("unable to retrieve OMIM entry {$omim_id}", E_USER_WARNING);
            continue;
        }

        // load entry, parse and write to file
        $raw = file_get_contents($gzfile);
        $entry = ($raw === false) ? null : json_decode($raw, true);
        if (!isset($entry["omim"]["entryList"][0]["entry"]['mimNumber'])) {
            echo PHP_EOL;
            trigger_error("unable to read OMIM entry from {$download_file}", E_USER_WARNING);
            continue;
        }
        $omim_id = trim((string) $entry["omim"]["entryList"][0]["entry"]['mimNumber']);
        echo $omim_id;
        $this->ParseEntry($entry, $type);
        parent::writeRDFBufferToWriteFile();
        echo PHP_EOL;
    }
    parent::writeRDFBufferToWriteFile();
    parent::getWriteFile()->close();

    // "H" gives a zero-padded hour ("G" does not), and gmdate() makes the
    // literal "Z" suffix truthful because the value is really UTC.
    $timestamp_format = 'Y-m-d\TH:i:s\Z';

    // generate the dataset description file
    $source_file = (new DataResource($this))
        ->setURI(parent::getParameterValue('omim_api_url'))
        ->setTitle("OMIM " . parent::getDatasetVersion())
        ->setRetrievedDate(gmdate($timestamp_format))
        ->setFormat("application/json")
        ->setPublisher("http://omim.org")
        ->setHomepage("http://omim.org")
        ->setRights("use")
        ->setRights("no-commercial")
        ->setRights("registration-required")
        ->setLicense("http://www.omim.org/help/agreement")
        ->setDataset("http://identifiers.org/omim/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outfile}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/omim/omim.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();

    return TRUE;
}
/**
 * Download and convert the requested DrugBank files to RDF.
 *
 * `files` is either 'all' (the list comes from parent::getParameterList())
 * or a "|" separated list. Only the 'drugbank' entry has a known download
 * file; any other name is skipped with a warning. For each handled file the
 * archive is downloaded when missing or when `download` is enabled, parsed
 * by the matching parse_<name>() method, and a dataset description is
 * appended to the release file.
 *
 * @return void
 */
function Run() {
    $indir = parent::getParameterValue('indir');
    $outdir = parent::getParameterValue('outdir');

    if (parent::getParameterValue('files') == 'all') {
        $files = explode("|", parent::getParameterList('files'));
        array_shift($files); // the first list item is dropped (presumably 'all' itself - confirm)
    } else {
        $files = explode("|", parent::getParameterValue('files'));
    }

    // optional restriction to specific identifiers, flipped for key lookup
    if (parent::getParameterValue("id_list")) {
        $this->id_list = array_flip(explode(",", parent::getParameterValue('id_list')));
    }

    // A loose "== true" is also true for the string "false".
    // FILTER_VALIDATE_BOOLEAN accepts boolean true as well as "true"/"1"
    // and rejects "false".
    // NOTE(review): assumes the parameter is a bool or a CLI string - confirm.
    $force_download = filter_var(parent::getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

    // output is gzipped when the format contains ".gz"
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;

    foreach ($files as $f) {
        // Only 'drugbank' maps to a download file. Anything else previously
        // ran with undefined (or stale) $file/$lname values, so skip it.
        if ($f != 'drugbank') {
            trigger_error("Unsupported file '{$f}' requested; skipping", E_USER_WARNING);
            continue;
        }
        $file = 'drugbank.xml.zip';
        $lname = 'drugbank';

        $fnx = 'parse_' . $f;
        $rfile = parent::getParameterValue('download_url') . $file;
        $lfile = $indir . $file;
        $cfile = $lname . "." . parent::getParameterValue('output_format');

        // download
        if ($force_download || !file_exists($lfile)) {
            utils::downloadSingle($rfile, $lfile);
        }

        // setup the write
        parent::setWriteFile($outdir . $cfile, $gz);
        echo $outdir . $cfile;

        $have_source = file_exists($lfile);
        if ($have_source) {
            // call the parser
            echo "processing {$file} ..." . PHP_EOL;
            $this->{$fnx}($indir, $file);
            echo "done" . PHP_EOL;
            parent::clear();
        }
        parent::getWriteFile()->close();

        // the description goes into the dataset graph; restore afterwards
        $ouri = parent::getGraphURI();
        parent::setGraphURI(parent::getDatasetURI());

        $source_version = parent::getDatasetVersion();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $prefix = parent::getPrefix();
        $date = date("Y-m-d\\TH:i:sP");

        // filemtime() fails on a missing file; use the current time then
        $mtime = $have_source ? filemtime($lfile) : false;
        $retrieved = ($mtime === false) ? time() : $mtime;

        // description of the downloaded source file
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("DrugBank ({$file})")
            ->setRetrievedDate(date("Y-m-d\\TH:i:sP", $retrieved))
            ->setFormat("application/xml")
            ->setFormat("application/zip")
            ->setPublisher("http://drugbank.ca")
            ->setHomepage("http://drugbank.ca")
            ->setRights("use")
            ->setRights("by-attribution")
            ->setRights("no-commercial")
            ->setLicense("http://www.drugbank.ca/about")
            ->setDataset("http://identifiers.org/drugbank/");

        // description of the generated RDF file
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$cfile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} v{$source_version}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/drugbank/drugbank.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        parent::writeToReleaseFile($source_file->toRDF() . $output_file->toRDF());
        parent::setGraphURI($ouri);
    }
    parent::closeReleaseFile();
}
/**
 * Download and convert dbSNP records to RDF.
 *
 * The first item of the comma separated `files` parameter selects the SNP
 * set: 'all', 'clinical', 'omim' or 'pharmgkb' load a predefined list, and
 * any other value means the parameter itself is the list of SNP ids. Each
 * SNP is cached in `indir` as <snp>.xml.gz, downloaded when missing or when
 * `download` is enabled, parsed with $this->parse(), and written to the
 * dbsnp output file. Finally the dataset description is written to the
 * release file.
 *
 * @return void
 */
function run() {
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // A loose "== true" is also true for the string "false".
    // FILTER_VALIDATE_BOOLEAN accepts boolean true as well as "true"/"1"
    // and rejects "false".
    // NOTE(review): assumes the parameter is a bool or a CLI string - confirm.
    $force_download = filter_var(parent::getParameterValue('download'), FILTER_VALIDATE_BOOLEAN);

    // work out which snps to process
    $snps = explode(",", parent::getParameterValue('files'));
    if ($snps[0] == 'all') {
        $snps = $this->getSNPs();
    } elseif ($snps[0] == 'clinical') {
        $snps = $this->getSNPs(true);
    } elseif ($snps[0] == 'omim') {
        $lfile = $ldir . 'snp_omimvar.txt';
        if ($force_download || !file_exists($lfile)) {
            utils::DownloadSingle('ftp://ftp.ncbi.nlm.nih.gov/snp/Entrez/snp_omimvar.txt', $lfile);
        }
        $snps = $this->processOMIMVar($lfile);
    } elseif ($snps[0] == 'pharmgkb') {
        $lfile = $ldir . 'pharmgkb.snp.zip';
        if ($force_download || !file_exists($lfile)) {
            utils::DownloadSingle('http://www.pharmgkb.org/download.do?objId=rsid.zip&dlCls=common', $lfile);
        }
        $snps = $this->processPharmGKBSnps($lfile);
    }

    // Keep the bare file name apart from the local path: the release URI
    // below must not contain the local output directory.
    $outname = "dbsnp." . parent::getParameterValue('output_format');
    $outfile = $odir . $outname;
    $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
    parent::setWriteFile($outfile, $gz);

    // the source URI defaults to the base URL so it is defined for an empty list
    $rfile = parent::getParameterValue('download_url');

    $n = count($snps);
    $z = 0;
    foreach ($snps as $i => $snp) {
        $file = $snp . '.xml.gz';
        $infile = $ldir . $file;
        $rfile = parent::getParameterValue('download_url') . $snp;

        // Decide per file. Flipping the global `download` parameter on the
        // first missing file forced a re-download of every later SNP.
        $download = $force_download || !file_exists($infile);
        if ($download) {
            trigger_error("Downloading {$file}", E_USER_NOTICE);
            $ret = utils::downloadSingle($rfile, "compress.zlib://" . $infile, true);
            if ($ret === false) {
                continue;
            }
        }

        // process
        echo "Processing {$snp} (" . ($i + 1) . "/{$n})" . PHP_EOL;
        $this->parse($infile);
        parent::writeRDFBufferToWriteFile();

        // reset the parent's state every 10000 records (including the first)
        if ($z++ % 10000 == 0) {
            parent::clear();
        }
    }
    parent::getWriteFile()->close();

    // "H" gives a zero-padded hour ("G" does not), and gmdate() makes the
    // literal "Z" suffix truthful because the value is really UTC.
    $timestamp_format = 'Y-m-d\TH:i:s\Z';

    // generate the dataset description file
    $source_file = (new DataResource($this))
        ->setURI($rfile)
        ->setTitle("dbSNP " . parent::getDatasetVersion())
        ->setRetrievedDate(gmdate($timestamp_format))
        ->setFormat("application/xml")
        ->setPublisher("http://www.ncbi.nlm.nih.gov")
        ->setHomepage("http://www.ncbi.nlm.nih.gov/SNP/")
        ->setRights("use-share-modify")
        ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
        ->setDataset("http://identifiers.org/dbsnp/");

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$outname}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dbsnp/dbsnp.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strstr(parent::getParameterValue('output_format'), "nt")) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
}