/**
 * Convert the requested MGI report files and write the dataset release description.
 *
 * Reads the 'indir', 'outdir', 'files', 'download', 'download_url',
 * 'output_format' and 'bio2rdf_release' parameters. For each selected item the
 * local file "<indir><item>.rpt" is (re)downloaded when it is missing or when
 * 'download' is 'true', then the method named after the item is invoked with
 * the read/write files set up. A source and an output DataResource description
 * are accumulated and finally written to the release file in 'outdir'.
 *
 * Items whose download fails are reported and skipped.
 */
function Run()
{
    $idir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $files = parent::getParameterValue('files');

    if ($files == 'all') {
        // The first entry of the '|'-separated parameter list is dropped
        // (presumably the 'all' keyword itself -- TODO confirm).
        $list = explode('|', parent::getParameterList('files'));
        array_shift($list);
    } else {
        $list = explode(',', parent::getParameterValue('files'));
    }

    $dataset_description = '';
    foreach ($list as $item) {
        $lfile = $idir . $item . '.rpt';
        $rfile = parent::getParameterValue('download_url') . $item . '.rpt';

        if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
            echo "downloading {$item}...";
            $ret = Utils::DownloadSingle($rfile, $lfile);
            if ($ret != true) {
                // Say why the item is missing from the output instead of skipping silently.
                echo "unable to download {$item} ... skipping" . PHP_EOL;
                continue;
            }
        }
        parent::setReadFile($lfile, true);

        echo "Processing {$item}...";
        // Keep the bare file name separate from the output directory: the file
        // name is reused below to build the public download URI, which must not
        // contain the local output path.
        $ofile = $item . '.' . parent::getParameterValue('output_format');
        $gz = strstr(parent::getParameterValue('output_format'), "gz") ? true : false;
        parent::setWriteFile($odir . $ofile, $gz);

        // Dispatch to the parser method that carries the same name as the item.
        $this->{$item}();

        parent::getWriteFile()->close();
        parent::getReadFile()->close();
        echo "Done" . PHP_EOL;
        parent::clear();

        // Description of the downloaded source file.
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("MGI {$item}")
            ->setRetrievedDate(date("Y-m-d\\TH:i:s", filemtime($lfile)))
            ->setFormat("text")
            ->setPublisher("http://www.informatics.jax.org")
            ->setHomepage("http://www.informatics.jax.org")
            ->setRights("use")
            ->setLicense("http://www.informatics.jax.org/mgihome/other/copyright.shtml")
            ->setDataset("http://identifiers.org/mgi/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = date("Y-m-d\\TH:i:s");

        // Description of the generated RDF file.
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$item} in {$prefix}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/mgi/mgi.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
    }

    // generate the dataset release file
    $this->setWriteFile($odir . parent::getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Fetch the selected Orphanet files and assemble the dataset release description.
 *
 * The selection comes from the 'files' parameter ('all' or a comma-separated
 * list); each name is resolved to an actual file name through $this->filemap.
 * Files are downloaded when absent locally or when 'download' is 'true'.
 * The per-file conversion step is currently commented out, so this method only
 * opens/closes the input and records the source/output descriptions, which are
 * handed to writeToReleaseFile() at the end.
 */
function run()
{
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $description = '';

    $selection = parent::getParameterValue('files');
    if ($selection == 'all') {
        $names = explode('|', parent::getParameterList('files'));
        array_shift($names);
    } else {
        $names = explode(',', parent::getParameterValue('files'));
    }

    foreach ($names as $file) {
        echo "processing {$file} ...";

        $lfile = $ldir . $this->filemap[$file];
        $rfile = parent::getParameterValue('download_url') . $this->filemap[$file];

        $must_fetch = !file_exists($lfile) || parent::getParameterValue('download') == 'true';
        if ($must_fetch) {
            $ret = utils::downloadSingle($rfile, $lfile);
            if ($ret === false) {
                echo "unable to download {$file} ... skipping" . PHP_EOL;
                continue;
            }
        }

        parent::setReadFile($lfile, true);

        $suffix = parent::getParameterValue('output_format');
        $ofile = "orphanet-" . $file . '.' . $suffix;
        $gz = (bool) strstr(parent::getParameterValue('output_format'), "gz");

        /*
        Conversion step, disabled in the original source:
        parent::setWriteFile($odir.$ofile, $gz);
        $this->$file($lfile);
        parent::getWriteFile()->close();
        */

        parent::getReadFile()->close();
        parent::clear();
        echo "done!" . PHP_EOL;

        // dataset description: the downloaded source file
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Orphanet: {$file}")
            ->setRetrievedDate(parent::getDate(filemtime($lfile)))
            ->setFormat("application/xml")
            ->setPublisher("http://www.orpha.net")
            ->setHomepage("http://www.orpha.net/")
            ->setRights("use")
            ->setRights("sharing-modified-version-needs-permission")
            ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
            ->setDataset("http://identifiers.org/orphanet/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        // Creation date is taken from the modification time of the output file on disk.
        $date = parent::getDate(filemtime($odir . $ofile));

        // dataset description: the generated RDF file
        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix}")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        // Note: this second test looks for ".gz" (with the dot), unlike the one above.
        $gz = strstr(parent::getParameterValue('output_format'), ".gz") !== false;
        if ($gz) {
            $output_file->setFormat("application/gzip");
        }

        $is_ntriples = strstr(parent::getParameterValue('output_format'), "nt");
        if ($is_ntriples) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $description .= $source_file->toRDF() . $output_file->toRDF();
    }

    parent::writeToReleaseFile($description);
}
/**
 * Load the current read file as XML and hand every 'datatype' entry to parseItem().
 *
 * The XML document is converted to nested PHP arrays by round-tripping it
 * through JSON. A warning is raised (and nothing is parsed) when the file
 * cannot be loaded or contains no 'datatype' entries.
 */
function parse()
{
    $lfile = parent::getReadFile()->getFileName();

    $xml = simplexml_load_file($lfile);
    if ($xml === false) {
        trigger_error("Unable to load XML from {$lfile}", E_USER_WARNING);
        return;
    }

    // convert into json, then into associative arrays
    $db = json_decode(json_encode($xml), true);

    // miriam metadata would be available in $db['@attributes'] (currently unused)

    if (!is_array($db) || !isset($db['datatype']) || !is_array($db['datatype'])) {
        trigger_error("No datatype entries found in {$lfile}", E_USER_WARNING);
        return;
    }

    $datatypes = $db['datatype'];
    // With the SimpleXML -> JSON conversion a single child element is decoded
    // as one associative array rather than a list of one; wrap it so that the
    // loop below always receives whole entries.
    if (count($datatypes) > 0 && !array_key_exists(0, $datatypes)) {
        $datatypes = array($datatypes);
    }

    foreach ($datatypes as $item) {
        $this->parseItem($item);
    }
}
/**
 * Convert the tab-separated gene interaction file into RDF.
 *
 * Lines starting with '#' are skipped; every other line must have 11 columns.
 * Columns used: [0] interaction id, [1] interaction type, [2] additional
 * info, [5] first gene, [8] second gene.
 *
 * Depending on the additional info the row is emitted as
 *  - a non-interaction plus an OWL negative property assertion ("No_interaction"),
 *  - a plain interaction ("N/A" or "Genetic_interaction"), or
 *  - a sub-typed interaction (anything else).
 *
 * The gene-to-gene predicate is chosen from the interaction type. When the
 * type is not one of the known ones a warning is raised and the
 * predicate-based statements are omitted for that row (the interaction itself
 * is still described).
 */
function gene_interactions()
{
    // interaction type => local name of the predicate linking the two genes
    $predicate_names = array(
        "Genetic" => "genetically-interacts-with",
        "Physical" => "physically-interacts-with",
        "Predicted" => "predicted-to-interact-with",
        "Regulatory" => "regulates",
    );

    while ($l = parent::getReadFile()->Read()) {
        if ($l[0] == '#') {
            continue;
        }

        $data = explode("\t", $l);
        if (count($data) != 11) {
            trigger_error("Found " . count($data) . " columns, expecting 11");
            continue;
        }

        $interaction = $data[0];
        $interaction_type = str_replace("_", "-", $data[1]);
        $interaction_type_label = str_replace("_", " ", $data[1]);
        $int_additional_info = $data[2];
        $gene1 = $data[5];
        $gene2 = $data[8];

        $interaction_id = parent::getNamespace() . $interaction;

        // Reset for every row so that an unknown type can never reuse the
        // predicate selected for a previous row.
        $int_pred = null;
        if (isset($predicate_names[$interaction_type])) {
            $int_pred = parent::getVoc() . $predicate_names[$interaction_type];
        } else {
            trigger_error("Unknown interaction type '" . $data[1] . "' for " . $interaction . "; no interaction predicate asserted", E_USER_WARNING);
        }

        if ($int_additional_info == "No_interaction") {
            $interaction_label = "No " . strtolower($interaction_type) . " interaction between " . $gene1 . " and " . $gene2;
            parent::addRDF(
                parent::describeIndividual($interaction_id, $interaction_label, parent::getVoc() . $interaction_type . "-Non-Interaction") .
                parent::describeClass(parent::getVoc() . $interaction_type . "-Non-Interaction", $interaction_type_label . " non-interaction") .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene1) .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene2)
            );

            // The negative property assertion is only meaningful with a predicate.
            if ($int_pred !== null) {
                $npa_id = parent::getRes() . md5($interaction_id . "negative property assertion");
                $npa_label = "Negative property assertion stating that " . $gene1 . " and " . $gene2 . " do not have a " . $interaction_type_label . " interaction";
                parent::addRDF(
                    parent::describeIndividual($npa_id, $npa_label, "owl:NegativeObjectPropertyAssertion") .
                    parent::triplify($npa_id, "owl:sourceIndividual", parent::getNamespace() . $gene1) .
                    parent::triplify($npa_id, "owl:targetIndividual", parent::getNamespace() . $gene2) .
                    parent::triplify($npa_id, "owl:assertionProperty", $int_pred)
                );
            }
        } elseif ($int_additional_info == "N/A" || $int_additional_info == "Genetic_interaction") {
            $interaction_label = $interaction_type . " interaction between " . $gene1 . " and " . $gene2;
            $rdf =
                parent::describeIndividual($interaction_id, $interaction_label, parent::getVoc() . $interaction_type . "-Interaction") .
                parent::describeClass(parent::getVoc() . $interaction_type . "-Interaction", $interaction_type_label . " Interaction") .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene1) .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene2);
            if ($int_pred !== null) {
                $rdf .= parent::triplify(parent::getNamespace() . $gene1, $int_pred, parent::getNamespace() . $gene2);
            }
            parent::addRDF($rdf);
        } else {
            // The additional info (when present) refines the interaction type.
            $interaction_label = ($int_additional_info != "" ? $int_additional_info . " " : "") . strtolower($interaction_type) . " interaction between " . $gene1 . " and " . $gene2;
            $type = parent::getVoc() . ($int_additional_info != "" ? $int_additional_info . "-" : "") . $interaction_type . "-Interaction";
            $type_label = ($int_additional_info != "" ? $int_additional_info . " " : "") . $interaction_type_label . " Interaction";
            $rdf =
                parent::describeIndividual($interaction_id, $interaction_label, $type) .
                parent::describeClass($type, $type_label, parent::getVoc() . $interaction_type . "-Interaction") .
                // Same class and same label as in the plain-interaction branch
                // (was misspelled "Interation" and used the hyphenated type).
                parent::describeClass(parent::getVoc() . $interaction_type . "-Interaction", $interaction_type_label . " Interaction") .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene1) .
                parent::triplify($interaction_id, parent::getVoc() . "involves", parent::getNamespace() . $gene2);
            if ($int_pred !== null) {
                $rdf .= parent::triplify(parent::getNamespace() . $gene1, $int_pred, parent::getNamespace() . $gene2);
            }
            parent::addRDF($rdf);
        }

        parent::WriteRDFBufferToWriteFile();
    }
}
/**
 * Download and convert the selected NCBI Gene files, then write the dataset description.
 *
 * The 'files' parameter is either 'all' (every entry of getPackageMap()) or a
 * comma-separated list of module names; unknown names are ignored. All
 * downloads are done first, then each module is converted by calling the
 * method named after it ('gene2refseq' is handled by gene2accession()).
 *
 * Boolean-like parameters ('download', 'limit_organisms', 'dataset_graph') are
 * compared against 'true', like the other parsers do: a loose comparison with
 * boolean true would also be satisfied by the string 'false'.
 * NOTE(review): assumes these parameters arrive as 'true'/'false' strings (or
 * real booleans) -- confirm against the parameter definitions.
 */
function process()
{
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $rdir = parent::getParameterValue('download_url');

    // which files are to be converted?
    $files = trim($this->GetParameterValue('files'));
    if ($files == 'all') {
        $files = $this->getPackageMap();
    } else {
        $sel_arr = explode(",", $files);
        $pm = $this->getPackageMap();
        $files = array();
        foreach ($sel_arr as $a) {
            if (array_key_exists($a, $pm)) {
                $files[$a] = $pm[$a];
            }
        }
    }

    if ($this->getParameterValue('limit_organisms') == 'true') {
        // taxid => position lookup built from the comma-separated 'organisms' parameter
        $this->taxids = array_flip(explode(",", $this->getParameterValue('organisms')));
    }

    // set dataset graph to be dataset URI
    $graph_uri = parent::getGraphURI();
    if (parent::getParameterValue('dataset_graph') == 'true') {
        parent::setGraphURI(parent::getDatasetURI());
    }

    $dataset_description = '';
    $force_download = (parent::getParameterValue('download') == 'true');

    // first pass: make sure every input file is available locally
    foreach ($files as $module => $rfilename) {
        $file = $module . ".gz";
        $lfile = $ldir . $file;
        $rfile = $rdir . $rfilename;

        $missing = !file_exists($lfile);
        if ($missing || $force_download) {
            if ($missing) {
                trigger_error("{$lfile} not found. Will attempt to download.", E_USER_NOTICE);
            }
            $myfile = $lfile;
            if ($module == "gene2sts" || $module == "gene2unigene") {
                // these two are written through the zlib stream wrapper
                $myfile = "compress.zlib://" . $lfile;
            }
            echo "downloading {$module} ...";
            utils::DownloadSingle($rfile, $myfile);
            echo "done" . PHP_EOL;
        }
    }

    // second pass: convert each file and describe it
    foreach ($files as $module => $rfilename) {
        $file = $module . ".gz";
        $lfile = $ldir . $file;
        $rfile = $rdir . $rfilename;
        $ofile = $module . "." . parent::getParameterValue('output_format');

        $gz = false;
        if (strstr(parent::getParameterValue('output_format'), "gz")) {
            $gz = true;
        }

        echo "Processing {$module} ... ";
        parent::setReadFile($lfile, true);
        parent::setWriteFile($odir . $ofile, $gz);

        $fnx = $module;
        if ($module == 'gene2refseq') {
            $fnx = 'gene2accession';
        }
        $this->{$fnx}();
        parent::clear();
        echo 'done!' . PHP_EOL;

        parent::getReadFile()->close();
        parent::getWriteFile()->close();

        // dataset description
        // Timestamps use gmdate() with 'H' so that the literal 'Z' suffix is
        // true (UTC) and the hour is always two digits.
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("NCBI Gene ({$module})")
            ->setRetrievedDate(gmdate("Y-m-d\\TH:i:s\\Z", filemtime($lfile)))
            ->setFormat("text/tab-separated-value")
            ->setFormat("application/gzip")
            ->setPublisher("http://www.ncbi.nlm.nih.gov")
            ->setHomepage("http://www.ncbi.nlm.nih.gov/gene")
            ->setRights("use-share-modify")
            ->setLicense("http://www.ncbi.nlm.nih.gov/About/disclaimer.html")
            ->setDataset("http://identifiers.org/ncbigene/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = gmdate("Y-m-d\\TH:i:s\\Z");

        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$ofile}")
            ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/ncbigene/ncbigene.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        if ($gz) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr(parent::getParameterValue('output_format'), "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        $dataset_description .= $source_file->toRDF() . $output_file->toRDF();
    }

    // set graph URI back to default value
    parent::setGraphURI($graph_uri);

    // write dataset description to file
    echo "Generating dataset description... ";
    parent::setWriteFile($odir . parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!" . PHP_EOL;
}
/**
 * Convert the tab-separated PharmGKB gene file into RDF.
 *
 * The first line is the header and must have 14 columns; otherwise a warning
 * is raised and false is returned. Columns as used below: [0] gene id,
 * [1] NCBI gene id, [2] Ensembl id, [3] name, [4] symbol, [5] alternative
 * names, [6] alternate symbols, [7] VIP flag, [8] variant-annotation flag,
 * [9] cross references, [10] CPIC dosing guideline, [11] chromosome,
 * [12] chromosome start, [13] chromosome end.
 *
 * Side effect: records id => name in $this->genes.
 *
 * @return false|null false when the header does not have the expected width
 */
function genes()
{
    $h = explode("\t", parent::getReadFile()->read());
    $expected_columns = 14;
    if (($n = count($h)) != $expected_columns) {
        trigger_error("Found {$n} columns in gene file - expecting {$expected_columns}!", E_USER_WARNING);
        return false;
    }

    while ($l = parent::getReadFile()->read(200000)) {
        $a = explode("\t", $l);

        $id = parent::getNamespace() . $a[0];
        $label = $a[3];
        $this->genes[$a[0]] = $a[3];

        parent::addRDF(
            parent::describeIndividual($id, $label, parent::getVoc() . "Gene") .
            parent::describeClass(parent::getVoc() . "Gene", "PharmGKB Gene")
        );

        // link data
        parent::addRDF(
            parent::triplify($id, "rdfs:seeAlso", "http://pharmgkb.org/gene/" . $a[0]) .
            parent::triplify($id, "rdfs:seeAlso", "http://www4.wiwiss.fu-berlin.de/diseasome/resource/genes/" . $a[0]) .
            parent::triplify($id, "rdfs:seeAlso", "http://dbpedia.org/resource/" . $a[0])
        );

        if ($a[1]) {
            parent::addRDF(parent::triplify($id, parent::getVoc() . "x-ncbigene", "ncbigene:" . $a[1]));
        }
        if ($a[2]) {
            parent::addRDF(parent::triplify($id, parent::getVoc() . "x-ensembl", "ensembl:" . $a[2]));
        }
        if ($a[3]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "name", $a[3]) .
                parent::describeProperty(parent::getVoc() . "name", "Relationship between a PharmGKB entity and its name")
            );
        }
        if ($a[4]) {
            parent::addRDF(
                parent::triplify($id, parent::getVoc() . "symbol", "symbol:" . $a[4]) .
                parent::describeProperty(parent::getVoc() . "symbol", "Relationship between a PharmGKB gene and a gene symbol")
            );
        }
        if ($a[5]) {
            // Quoted, comma-separated list.
            // NOTE(review): substr(..., 1, -2) drops the first and the last TWO
            // characters; if the field ends with a single closing quote this
            // truncates the last name -- confirm against the data.
            $b = explode('","', substr($a[5], 1, -2));
            foreach ($b as $alt_name) {
                parent::addRDF(parent::triplifyString($id, parent::getVoc() . "alternative-name", parent::safeLiteral(trim(stripslashes($alt_name)))));
            }
            parent::addRDF(parent::describeProperty(parent::getVoc() . "alternative-name", "Relationship between a PharmGKB gene and an alternative name"));
        }
        if ($a[6]) {
            // these are not hgnc symbols
            $b = explode('","', substr($a[6], 1, -2));
            foreach ($b as $alt_symbol) {
                parent::addRDF(parent::triplifyString($id, parent::getVoc() . "alternate-symbol", trim($alt_symbol)));
            }
            // Describe the property itself (the gene id was previously passed
            // as the first argument, shifting the property URI and its label).
            parent::addRDF(parent::describeProperty(parent::getVoc() . "alternate-symbol", "Relationship between a PharmGKB gene and an alternate gene symbol"));
        }
        if ($a[7]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "is-vip", $a[7]) .
                parent::describeProperty(parent::getVoc() . "is-vip", "Relationship between a PharmGKB gene and its vip status")
            );
        }
        if ($a[8]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "has-variant-annotation", $a[8]) .
                parent::describeProperty(parent::getVoc() . "has-variant-annotation", "Relationship between a PharmGKB gene and whether it has a variant annotation")
            );
        }
        if ($a[9]) {
            $b = explode(",", $a[9]);
            foreach ($b as $xref) {
                $xref = trim($xref);
                if (!$xref) {
                    continue;
                }
                // MapXrefs() is expected to fill $url, $ns and $id2
                // (presumably by reference -- TODO confirm).
                $url = false;
                $x = $this->MapXrefs($xref, $url, $ns, $id2);
                $ns = str_replace(' ', '', $ns);
                if ($url == true) {
                    parent::addRDF(parent::QQuadO_URL($id, parent::getVoc() . "x-{$ns}", $x));
                } else {
                    parent::addRDF(parent::triplify($id, parent::getVoc() . "x-{$ns}", $x));
                }
            }
        }
        if ($a[10]) {
            parent::addRDF(parent::triplifyString($id, parent::getVoc() . "cpic-dosing-guideline", $a[10]));
        }
        if ($a[11]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "chromosome", $a[11]) .
                // The description now targets the property that is actually
                // used above (was misspelled "chrosomome").
                parent::describeProperty(parent::getVoc() . "chromosome", "Relationship between a PharmGKB gene and its chromosomal position") .
                parent::triplifyString($id, parent::getVoc() . "chromosome-start", $a[12]) .
                parent::triplifyString($id, parent::getVoc() . "chromosome-end", $a[13])
            );
        }

        parent::WriteRDFBufferToWriteFile();
    }
}
/**
 * Convert an Affymetrix annotation CSV (41 quoted columns) into RDF.
 *
 * Layout handled by this method:
 *  - the very first line is skipped;
 *  - lines starting with '#' are "key=value" dataset attributes, attached to
 *    the dataset URI;
 *  - the first non-'#' line is the column header;
 *  - every following line describes one probeset.
 *
 * Columns for which $this->Map($k) returns a value become "x-<prefix>"
 * cross references; the others are handled by header name in the switch
 * below. A header or row without 41 columns raises E_USER_ERROR and stops.
 *
 * @param mixed $file not used by this method
 */
function Parse($file)
{
    // Source tag found in 'Transcript Assignments' => prefix to emit.
    $transcript_prefix_map = array(
        'gb' => 'genbank',
        'gb_htc' => 'genbank',
        'ncbibacterial' => 'gi',
        'ncbi_bacterial' => 'gi',
        'ens' => 'ensembl',
        'ncbi_mito' => 'refseq',
        'ncbi_organelle' => 'refseq',
        'organelle' => 'refseq',
        'affx' => 'affymetrix',
        'unknown' => 'affymetrix',
        'prop' => 'affymetrix',
        'tigr_2004_08' => 'tigr',
        'tigr-plantta' => 'genbank',
        'newrs.gi' => 'gi',
        'newRS.gi' => 'gi',
        'primate_viral' => 'genbank',
        'jgi-bacterial' => 'ncbigene',
        'tb' => 'tuberculist',
        'pa' => 'pseudomonas',
    );

    parent::getReadFile()->read(); // skip the first comment line
    $line = 1;
    $first = true;

    while ($l = parent::getReadFile()->read(500000)) {
        // keep the counter in step with the input so error messages point at the right line
        $line++;

        if ($l[0] == "#") {
            // dataset attributes
            $a = explode('=', trim($l));
            $r = $this->getVoc() . substr($a[0], 2);
            if (isset($a[1])) {
                $v = $a[1];
                if ($r == "affymetrix_vocabulary:genome-version-create_date") {
                    // a day of "00" is replaced by "01"
                    $x = explode("-", $a[1]);
                    if ($x[2] == "00") {
                        $x[2] = "01";
                    }
                    $v = implode("-", $x);
                }
                parent::addRDF(
                    parent::triplifyString(parent::getDatasetURI(), $r, $v) .
                    parent::describe($r, "{$r}")
                );
            }
            continue;
        }

        if ($first == true) {
            $first = false;
            // header
            $header = explode(",", str_replace('"', '', trim($l)));
            $n = count($header);
            if ($n != 41) {
                trigger_error("Expecting 41 columns, found {$n} in header on line {$line}!", E_USER_ERROR);
                exit;
            }
            continue;
        }

        // strip the leading quote and the trailing two characters, then split on the quoted separator
        $a = explode('","', substr($l, 1, -2));
        $n = count($a);
        if ($n != 41) {
            trigger_error("Expecting 41 columns, found {$n} on line {$line}!", E_USER_ERROR);
            exit;
        }
        parent::writeRDFBufferToWriteFile();

        $id = $a[0];
        $qname = "affymetrix:{$id}";
        $label = "probeset {$a['0']} on GeneChip {$a['1']} ({$a['2']})";
        parent::addRDF(
            parent::describeIndividual($qname, $label, $this->getVoc() . "Probeset") .
            parent::describeClass($this->getVoc() . "Probeset", "Affymetrix probeset")
        );
        trigger_error($id, E_USER_NOTICE);

        // now process the entries
        foreach ($a as $k => $v) {
            if (trim($v) == '---') {
                continue;
            }

            // multi-valued entries are separated by " /// "
            $b = explode(" /// ", $v);
            $r = $this->Map($k);
            if (isset($r)) {
                foreach ($b as $c) {
                    $d = explode(" // ", $c);
                    if ($r == 'symbol') {
                        $d[0] = str_replace(" ", "-", $d[0]);
                    }
                    $s = $this->getRegistry()->getPreferredPrefix($r);
                    if ($s == "ec") {
                        // keep only the part after the first ':'
                        $e = explode(":", $d[0]);
                        $d[0] = $e[1];
                    }
                    $this->addRDF(
                        parent::triplify($qname, $this->getVoc() . "x-{$s}", "{$s}:" . $d[0]) .
                        parent::describeProperty($this->getVoc() . "x-{$s}", "a relation to {$s}")
                    );
                }
            } else {
                // we handle manually
                unset($rel);
                $label = $header[$k];
                switch ($label) {
                    case 'GeneChip Array':
                        $array_id = parent::getRes() . str_replace(" ", "-", $v);
                        parent::addRDF(
                            parent::triplify($qname, $this->getVoc() . "genechip-array", $array_id) .
                            parent::describeIndividual($array_id, "Affymetrix {$v} GeneChip array", $this->getVoc() . "Genechip-Array") .
                            parent::describeClass($this->getVoc() . "Genechip-Array", "Affymetrix GeneChip array")
                        );
                        break;

                    // The three GO cases deliberately fall through: the first
                    // one entered sets $rel, the later ones leave it alone.
                    case 'Gene Ontology Biological Process':
                        if (!isset($rel)) {
                            $rel = 'go-process';
                            $prefix = "go";
                        }
                    case 'Gene Ontology Cellular Component':
                        if (!isset($rel)) {
                            $rel = 'go-location';
                            $prefix = "go";
                        }
                    case 'Gene Ontology Molecular Function':
                        if (!isset($rel)) {
                            $rel = 'go-function';
                            $prefix = "go";
                        }
                        $b = explode(" /// ", $v);
                        foreach ($b as $c) {
                            $d = explode(" // ", $c);
                            parent::addRDF(
                                $this->triplify($qname, $this->getVoc() . $rel, "{$prefix}:" . $d[0]) .
                                $this->describeProperty($this->getVoc() . $rel, "{$rel}")
                            );
                        }
                        break;

                    case 'Transcript Assignments':
                        $b = explode(" /// ", $v);
                        foreach ($b as $c) {
                            $d = explode(" // ", $c);
                            if (count($d) < 3) {
                                // no source tag: an identifier without a prefix cannot be emitted
                                continue;
                            }
                            $t_id = $d[0];
                            $t_prefix = $d[2];
                            if ($t_prefix == '---' || $t_id == '---') {
                                continue;
                            }

                            if (isset($transcript_prefix_map[$t_prefix])) {
                                $t_prefix = $transcript_prefix_map[$t_prefix];
                            } elseif ($t_prefix == 'gi|53267') {
                                $t_prefix = 'gi';
                                $t_id = '53267';
                            } elseif ($t_prefix == 'broad-tcup') {
                                // keep the part of the identifier before the first '-'; prefix unchanged
                                $e = explode("-", $t_id);
                                $t_id = $e[0];
                            }

                            parent::addRDF(
                                parent::triplify($qname, $this->getVoc() . "transcript-assignment", "{$t_prefix}:{$t_id}") .
                                parent::describeProperty($this->getVoc() . "transcript-assignment", "transcript assignment")
                            );
                        }
                        break;

                    case 'Annotation Transcript Cluster':
                        // intentionally not converted
                        break;

                    case 'Annotation Date':
                        // e.g. "Jun 9, 2011"
                        $rel = "annotation-date";
                        preg_match("/^([A-Za-z]+) ([0-9]+), ([0-9]{4})\$/", $v, $m);
                        if (count($m) == 4) {
                            // Unpack into fresh variables: assigning back into the
                            // array being unpacked is undefined behaviour for list().
                            list(, $month_name, $day, $year) = $m;
                            $month = $this->getMonth($month_name);
                            if (!$day || $day == "0") {
                                $day = "01";
                            }
                            $date = $year . "-" . $month . "-" . str_pad($day, 2, "0", STR_PAD_LEFT) . "T00:00:00Z";
                            parent::addRDF(
                                parent::triplifyString($qname, $this->getVoc() . $rel, $date, "xsd:dateTime") .
                                parent::describeProperty($this->getVoc() . $rel, "{$rel}")
                            );
                        } else {
                            trigger_error("could not match date from {$v}", E_USER_ERROR);
                        }
                        break;

                    case 'Species Scientific Name':
                        break;

                    case 'Transcript ID(Array Design)':
                        if (!isset($rel)) {
                            $rel = 'transcript';
                        }
                        // falls through to the generic literal handling
                    case 'Sequence type':
                    default:
                        if (!isset($rel)) {
                            // derive the relation name from the column header
                            $rel = str_replace(" ", "-", strtolower($label));
                        }
                        $b = explode(" /// ", $v);
                        foreach ($b as $c) {
                            parent::addRDF(
                                parent::triplifyString($qname, $this->getVoc() . $rel, stripslashes($c)) .
                                parent::describeProperty($this->getVoc() . $rel, "{$rel}")
                            );
                        }
                        break;
                } // switch
            } // else
        }

        $this->WriteRDFBufferToWriteFile();
    }
}
/**
 * Convert the GenAge model-organism CSV into RDF.
 *
 * The header line must split into 8 comma-separated columns, otherwise a
 * warning is raised and false is returned. Row columns:
 *   [0] GenAge ID  [1] symbol  [2] name  [3] organism  [4] entrez gene id
 *   [5] avg lifespan change (max obsv)  [6] lifespan effect
 *   [7] longevity influence
 *
 * Rows with fewer columns than expected are reported and skipped. The taxon
 * link is only emitted for organisms listed in the local name => taxonomy id
 * table; other organisms are reported with a warning.
 *
 * @return false|null false when the header does not have the expected width
 */
function models()
{
    // organism name as it appears in the file => NCBI taxonomy id
    $tax_ids = array(
        "Caenorhabditis elegans" => "6239",
        "Mus musculus" => "10090",
        "Saccharomyces cerevisiae" => "4932",
        "Drosophila melanogaster" => "7227",
        "Podospora anserina" => "5145",
        "Mesocricetus auratus" => "10036",
        "Schizosaccharomyces pombe" => "4896",
        "Danio rerio" => "7955",
    );

    $h = explode(",", parent::getReadFile()->read());
    $expected_columns = 8;
    if (($n = count($h)) != $expected_columns) {
        trigger_error("Found {$n} columns in gene file - expecting {$expected_columns}!", E_USER_WARNING);
        return false;
    }

    while ($l = parent::getReadFile()->read(200000)) {
        $data = str_getcsv($l);
        if (($n = count($data)) < $expected_columns) {
            // e.g. a blank or truncated line
            trigger_error("Found {$n} columns in row - expecting {$expected_columns}; skipping", E_USER_WARNING);
            continue;
        }

        // GenAge ids are left-padded with zeros to four characters
        $genage = str_pad($data[0], 4, "0", STR_PAD_LEFT);
        $gene_symbol = $data[1];
        $name = $data[2];
        $organism = $data[3];
        $ncbi_gene_id = $data[4];
        $max_percent_obsv_avg_lifespan_change = $data[5];
        $lifespan_effect = $data[6];
        $longevity_influence = $data[7];

        $genage_id = parent::getNamespace() . $genage;

        parent::addRDF(
            parent::describeIndividual($genage_id, $name, parent::getVoc() . "Aging-Related-Gene") .
            parent::describeClass(parent::getVoc() . "Aging-Related-Gene", "Aging Related Gene")
        );
        parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "gene-symbol", parent::safeLiteral($gene_symbol)));

        if (isset($tax_ids[$organism])) {
            parent::addRDF(parent::triplify($genage_id, parent::getVoc() . "taxon", "ncbitaxon:" . $tax_ids[$organism]));
        } else {
            // Do not emit an "ncbitaxon:" reference without an identifier.
            trigger_error("No NCBI taxonomy id known for organism '{$organism}' (GenAge id {$genage})", E_USER_WARNING);
        }

        if ($ncbi_gene_id !== "") {
            parent::addRDF(parent::triplify($genage_id, parent::getVoc() . "x-ncbigene", "ncbigene:" . $ncbi_gene_id));
        }
        if ($max_percent_obsv_avg_lifespan_change !== "") {
            parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "maximum-percent-observed-average-lifespan-change", parent::safeLiteral($max_percent_obsv_avg_lifespan_change)));
        }

        if ($lifespan_effect == "Increase and Decrease") {
            // a combined effect is recorded as two separate values
            parent::addRDF(
                parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", "increase") .
                parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", "decrease")
            );
        } else {
            parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", strtolower($lifespan_effect)));
        }

        parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "longevity-influence", strtolower($longevity_influence)));

        parent::WriteRDFBufferToWriteFile();
    }
}
/**
 * Convert a 17-column tab-separated GO annotation file into RDF.
 *
 * Lines starting with '!' are ignored. A line with a different column count
 * raises E_USER_ERROR and makes the method return false. Rows for which
 * getdbURI() yields nothing are dumped with print_r() and skipped.
 *
 * @param string $file used as part of the generated annotation identifiers
 * @return false|null false on a malformed line
 */
function process($file)
{
    // positive aspect => relation used when the qualifier is 'NOT'
    $negated_relations = array(
        'process' => 'not-in-process',
        'function' => 'not-has-function',
        'component' => 'not-in-component',
    );

    // running annotation counter, also drives the periodic clear()
    $z = 1;

    while ($l = parent::getReadFile()->read(100000)) {
        if ($z % 100000 == 0) {
            parent::clear();
        }
        if ($l[0] == "!") {
            continue;
        }

        $fields = explode("\t", $l);
        $column_count = count($fields);
        if ($column_count != 17) {
            trigger_error("Expected 17 columns, but found " . $column_count, E_USER_ERROR);
            return false;
        }

        $db = $fields[0];
        $id = $fields[1];
        $symbol = $fields[2];
        $qualifier = $fields[3];
        // drop the first three characters of the GO identifier column
        $goid = substr($fields[4], 3);
        $refs = $this->getDbReferences($fields[5]);
        $eco = $this->getEvidenceCodeLabelArr($fields[6]);
        $aspect = $this->getAspect($fields[8]);
        $label = $fields[9];
        $synonyms = explode("|", $fields[10]);
        $taxid = $fields[12];
        $date = $this->parseDate($fields[13]);
        $assignedBy = $fields[14];

        // entity id
        $eid = $this->getdbURI($db, $id);
        if (!$eid) {
            print_r($fields);
            continue;
        }

        $voc = parent::getVoc();
        $go_term = "go:" . $goid;

        // the annotated entity
        $entity_rdf = parent::describeIndividual($eid, $label, $voc . "GO-Annotation");
        $entity_rdf .= parent::describeClass($voc . "GO-Annotation", "GO Annotation");
        $entity_rdf .= parent::triplifyString($eid, $voc . "symbol", $symbol);
        parent::addRDF($entity_rdf);

        parent::addRDF(parent::triplify($eid, $voc . "x-taxonomy", $taxid));

        foreach ($synonyms as $synonym) {
            if (empty($synonym)) {
                continue;
            }
            parent::addRDF(parent::triplifyString($eid, $voc . "synonym", $synonym));
        }

        // relation between the entity and the GO term
        $rel = $aspect;
        if ($qualifier == 'NOT') {
            foreach ($negated_relations as $positive => $negative) {
                if ($aspect == $positive) {
                    $rel = $negative;
                }
            }
        }
        parent::addRDF(
            parent::describeObjectProperty($voc . $rel, str_replace("-", " ", $rel)) .
            parent::triplify($eid, $voc . $rel, $go_term)
        );

        // the annotation record itself
        $type = key($eco);
        $aid = parent::getRes() . $file . "_" . $z;
        $z++;
        parent::addRDF(
            parent::describeObjectProperty($voc . "go-annotation", "GO annotation") .
            parent::triplify($eid, $voc . "go-annotation", $aid)
        );

        $cat = parent::getRes() . md5($aspect);
        $annotation_rdf = parent::describeIndividual($aid, "{$id}-go:{$goid} association", $voc . "GO-Annotation");
        $annotation_rdf .= parent::triplify($aid, $voc . "target", $eid);
        $annotation_rdf .= parent::triplify($aid, $voc . "go-term", $go_term);
        $annotation_rdf .= parent::triplify($aid, $voc . "evidence", "eco:" . $eco[$type][1]);
        $annotation_rdf .= parent::triplify($aid, $voc . "go-category", $cat);
        $annotation_rdf .= parent::describeClass($cat, $aspect);
        $annotation_rdf .= parent::triplifyString($aid, $voc . "assigned-by", $assignedBy);
        parent::addRDF($annotation_rdf);

        if ($date != '') {
            parent::addRDF(parent::triplifyString($aid, $voc . "entry-date", $date . "T00:00:00Z", "xsd:dateTime"));
        }

        // only PubMed references are linked
        foreach ($refs as $ref) {
            $ref_parts = explode(":", $ref);
            if ($ref_parts[0] == 'PMID') {
                parent::addRDF(parent::triplify($aid, $voc . "article", "pubmed:" . $ref_parts[1]));
            }
        }

        // write RDF to file
        parent::writeRDFBufferToWriteFile();
    }
}
/**
 * Convert the 8-column gene expression CSV into RDF.
 *
 * Row columns: [0] MGI symbol, [1] MGI description, [2] gene id,
 * [3] total datasets, [4] datasets over-expressed, [5] datasets
 * under-expressed, [6] p-value, [7] expression direction.
 *
 * Each row yields one expression resource and one evidence resource, both
 * identified by an md5 hash over the numeric columns.
 *
 * @return false|null false when the header does not have 8 columns
 */
function gene_expression()
{
    $expected_columns = 8;
    $header = explode(",", parent::getReadFile()->read());
    $n = count($header);
    if ($n != $expected_columns) {
        trigger_error("Found {$n} columns in gene file - expecting {$expected_columns}!", E_USER_WARNING);
        return false;
    }

    while ($l = parent::getReadFile()->read(200000)) {
        $row = str_getcsv($l);
        $mgi_symbol = $row[0];
        $mgi_description = $row[1];
        $geneid = $row[2];
        $total_datasets = $row[3];
        $total_ovexp = $row[4];
        $total_underexp = $row[5];
        $p_value = $row[6];
        $expression = $row[7];

        // both identifiers hash the same key; the evidence one adds a suffix
        $hash_key = $geneid . $total_datasets . $total_ovexp . $total_underexp . $p_value . $expression;
        $id = parent::getRes() . md5($hash_key);
        $evidence_id = parent::getRes() . md5($hash_key . "_evidence");

        $label = "Dietary restriction induced {$expression}-expression of {$mgi_symbol}"
            . " based on microarray results from {$total_datasets} datasets, with p-value {$p_value}";
        $type_label = "Gene " . ucfirst($expression) . " Expression";
        $type = parent::getVoc() . str_replace(" ", "-", $type_label);

        $voc = parent::getVoc();
        $gene = "ncbigene:" . $geneid;

        $rdf = parent::describeIndividual($id, $label, $type);
        $rdf .= parent::describeClass($type, $type_label);
        $rdf .= parent::triplify($id, $voc . "gene", $gene);
        $rdf .= parent::triplifyString($gene, $voc . "mgi-gene-symbol", $mgi_symbol);
        $rdf .= parent::triplifyString($gene, $voc . "mgi-gene-description", $mgi_description);
        $rdf .= parent::triplify($id, $voc . "evidence", $evidence_id);
        $rdf .= parent::triplifyString($id, $voc . "perturbation-context", "dietary restriction");
        $rdf .= parent::triplifyString($evidence_id, $voc . "total-number-datasets", $total_datasets);
        $rdf .= parent::triplifyString($evidence_id, $voc . "total-number-datasets-overexpressed", $total_ovexp);
        $rdf .= parent::triplifyString($evidence_id, $voc . "total-number-datasets-underexpressed", $total_underexp);
        $rdf .= parent::triplifyString($evidence_id, $voc . "p-value", $p_value);
        parent::addRDF($rdf);

        parent::writeRDFBufferToWriteFile();
    }
}
/**
 * Convert one KEGG database listing and the entry file of every listed item.
 *
 * Each line of the read file is "<ns>:<id><TAB><name>". When $this->idlist is
 * set, only ids contained in it are processed; when $this->org is set, the id
 * is prefixed with "<ns>_". For every item the entry text is downloaded (when
 * missing locally or when 'download' is 'true') and passed to parseEntry().
 * For the "pathway" database the KGML file is fetched and parsed as well.
 * Items whose download fails are reported and skipped.
 *
 * @param string $db database name; used to build the class name and label
 */
function process($db)
{
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    while ($l = parent::getReadFile()->read()) {
        list($nsid, $name) = explode("\t", $l);
        list($ns, $id) = explode(":", $nsid);

        if (isset($this->idlist) && !in_array($id, $this->idlist)) {
            continue;
        }
        if (isset($this->org)) {
            $id = $ns . "_" . $id;
        }

        $uri = $this->getNamespace() . $id;
        $class_uri = parent::getVoc() . ucfirst($db);
        parent::addRDF(
            parent::describeIndividual($uri, $name, $class_uri) .
            parent::describeClass(parent::getVoc() . ucfirst($db), "KEGG {$db}") .
            parent::triplifyString($uri, parent::getVoc() . "internal-id", $nsid)
        );

        // now get the entry for this item
        $lfile = $ldir . $id . ".txt";
        $rfile = parent::getParameterValue("download_url") . "get/{$nsid}";
        if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
            echo "downloading {$nsid} ";
            if (utils::downloadSingle($rfile, $lfile) === false) {
                echo "unable to download {$nsid} ... skipping" . PHP_EOL;
                continue;
            }
            echo "done. ";
        }

        echo "parsing {$nsid} ... ";
        $this->parseEntry($lfile);
        parent::writeRDFBufferToWriteFile();

        if ($db === "pathway") {
            // the KGML download is requested with "ko" in place of "map" in the id
            $ko = str_replace("map", "ko", $id);
            $lfile = $ldir . $id . ".kgml";
            $rfile = "http://www.kegg.jp/kegg-bin/download?entry=" . $ko . "&format=kgml";
            if (!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
                echo "downloading KGML for {$nsid} ";
                if (utils::downloadSingle($rfile, $lfile) === false) {
                    echo "unable to download {$nsid} ... skipping" . PHP_EOL;
                    continue;
                }
                echo "done. ";
            }
            $this->parseKGML($lfile);
            parent::writeRDFBufferToWriteFile();
        }

        echo "done!" . PHP_EOL;
    }
}
/**
 * Convert an iRefIndex MITAB file (the current read file) into RDF.
 *
 * The first line is the header and must contain 54 tab-separated columns.
 * Every following line describes one interaction record; the RDF for a
 * record is buffered with addRDF() and flushed to the write file before the
 * next line is read.
 *
 * Column numbers in the comments below are 0-based indexes into the row.
 *
 * @return false|null false when the header does not have 54 columns;
 *                    nothing otherwise. Terminates the script when column 13
 *                    does not hold exactly three '|'-separated entries.
 */
function Parse()
{
    // The first character of the header line is dropped before splitting.
    $l = parent::getReadFile()->read(100000);
    $header = explode("\t", trim(substr($l, 1)));
    if (($c = count($header)) != 54) {
        trigger_error("Expecting 54 columns, found {$c}!");
        return FALSE;
    }

    // Interaction type classes whose label has already been emitted.
    $defined = array();

    while ($l = parent::getReadFile()->read(500000)) {
        $a = explode("\t", trim($l));

        // irefindex identifiers
        $rigid = "irefindex." . $a[34];            // checksum for interaction
        $rogida = "irefindex." . $a[32];           // checksum for A
        $rogidb = "irefindex." . $a[33];           // checksum for B
        $irigid = "irefindex.irigid:" . $a[44];    // integer id for interaction
        $irogida = "irefindex.irogid:" . $a[42];   // integer id for A
        $irogidb = "irefindex.irogid:" . $a[43];   // integer id for B
        $crigid = "irefindex.crigid:" . $a[47];    // checksum for canonical interaction
        $icrigid = "irefindex.icrigid:" . $a[50];  // integer id for canonical interaction
        $crogida = "irefindex.crogid:" . $a[45];   // checksum for A's canonical group
        $crogidb = "irefindex.crogid:" . $a[46];   // checksum for B's canonical group
        $icrogida = "irefindex.icrogid:" . $a[48]; // integer for A's canonical group
        $icrogidb = "irefindex.icrogid:" . $a[49]; // integer for B's canonical group

        // 13 contains the original identifier, the rigid, and the edgetype
        $ids = explode("|", $a[13]);
        if (count($ids) != 3) {
            trigger_error("Expecting 3 entries in column 14");
            print_r($ids);
            exit;
        }
        parent::getRegistry()->parseQName($ids[0], $ns, $id);
        if ($id == '-') {
            // this happens with hprd
            $iid = "hprd:" . substr($ids[1], 6);
        } else {
            $iid = $ns . ":" . $id;
        }

        // get the type
        if ($a[52] == "X") {
            $label = "{$a[0]} - {$a[1]} Interaction";
            $type = "Pairwise-Interaction";
        } elseif ($a[52] == "C") {
            $label = $a[53] . " component complex"; // num of participants
            $type = "Multimeric-Complex";
        } elseif ($a[52] == "Y") {
            $label = "{$a[0]} homomeric complex";
            $type = "Homopolymeric-Complex";
        } else {
            // Without this guard the label/type of the previous row (or
            // undefined values on the first row) would be reused.
            trigger_error("Unknown edge type '{$a[52]}' for {$iid} ... skipping record", E_USER_WARNING);
            continue;
        }
        parent::addRDF(
            parent::describeIndividual($iid, $label, parent::getVoc() . $type)
            . parent::describeClass(parent::getVoc() . $type, str_replace("-", " ", $type))
        );

        // interaction detection method [6]
        if ($a[6] != '-') {
            $data = $this->ParseStringArray($a[6]);
            $qname = trim($data["ns"]) . ":" . trim($data["id"]);
            if ($qname) {
                parent::addRDF(
                    parent::triplify($iid, parent::getVoc() . "method", $qname)
                    . parent::describeClass($qname, $data['label'])
                );
            }
        }

        parent::addRDF(parent::triplify($iid, "rdfs:seeAlso", "http://wodaklab.org/iRefWeb/interaction/show/" . $a[50]));

        // set the interactors: columns 0/1, with roles and types at fixed
        // offsets for A (i = 0) and B (i = 1)
        for ($i = 0; $i <= 1; $i++) {
            $p = ($i == 1) ? 'b' : 'a';

            $data = $this->ParseStringArray($a[$i]);
            $interactor = trim($data["ns"]) . ":" . trim($data["id"]);
            parent::addRDF(parent::triplify($iid, parent::getVoc() . "interactor_{$p}", $interactor));

            // biological role
            $role = $a[16 + $i];
            if ($role != '-') {
                $data = $this->ParseStringArray($role);
                $qname = trim($data["ns"]) . ":" . trim($data["id"]);
                if ($qname != "mi:0000") {
                    parent::addRDF(
                        parent::triplify($iid, parent::getVoc() . "interactor_{$p}" . "_biological_role", $qname)
                        . parent::describeClass($qname, $data['label'])
                    );
                }
            }

            // experimental role
            $role = $a[18 + $i];
            if ($role != '-') {
                $data = $this->ParseStringArray($role);
                $qname = trim($data["ns"]) . ":" . trim($data["id"]);
                if ($qname != "mi:0000") {
                    parent::addRDF(
                        parent::triplify($iid, parent::getVoc() . "interactor_{$p}" . "_experimental_role", $qname)
                        . parent::describeClass($qname, $data['label'])
                    );
                }
            }

            // interactor type
            $itype = $a[20 + $i];
            if ($itype != '-') {
                $data = $this->ParseStringArray($itype);
                $qname = trim($data["ns"]) . ":" . trim($data["id"]);
                parent::addRDF(
                    parent::triplify($interactor, "rdf:type", $qname)
                    . parent::describeClass($qname, $data['label'])
                );
            }
        }

        // add the alternatives through the taxon + seq redundant group
        for ($i = 0; $i <= 1; $i++) {
            $taxid = '';
            $rogid = "irefindex." . $a[32 + $i];
            parent::addRDF(
                parent::describeIndividual($rogid, "", parent::getVoc() . "Taxon-Sequence-Identical-Group")
                . parent::describeClass(parent::getVoc() . "Taxon-Sequence-Identical-Group", "Taxon + Sequence Identical Group")
            );

            $tax = $a[9 + $i];
            if ($tax && $tax != '-' && $tax != '-1') {
                $data = $this->ParseStringArray($tax);
                $taxid = trim($data["ns"]) . ":" . trim($data["id"]);
                parent::addRDF(parent::triplify($rogid, parent::getVoc() . "x-taxonomy", $taxid));
            }

            // alternative identifiers of the interactor (columns 3/4)
            $list = explode("|", $a[3 + $i]);
            foreach ($list as $item) {
                $data = $this->ParseStringArray($item);
                $ns = trim($data["ns"]);
                $id = trim($data["id"]);
                $qname = $ns . ":" . $id;
                if ($ns && $ns != 'rogid' && $ns != 'irogid' and $id != '-') {
                    parent::addRDF(parent::triplify($rogid, parent::getVoc() . "has-member", $qname));
                    if ($taxid && $taxid != '-' && $taxid != '-1') {
                        parent::addRDF(parent::triplify($qname, parent::getVoc() . "x-taxonomy", $taxid));
                    }
                }
            }
        }

        // publications
        $list = explode("|", $a[8]);
        foreach ($list as $item) {
            // Skip the "no value" marker and the pubmed:0 placeholder. The
            // previous condition joined the two tests with && and therefore
            // never filtered pubmed:0.
            if ($item == '-' || $item == 'pubmed:0') {
                continue;
            }
            $data = $this->ParseStringArray($item);
            $qname = trim($data["ns"]) . ":" . trim($data["id"]);
            parent::addRDF(parent::triplify($iid, parent::getVoc() . "article", $qname));
        }

        // MI interaction type
        if ($a[11] != '-' && $a[11] != 'NA') {
            $data = $this->ParseStringArray($a[11]);
            $qname = trim($data["ns"]) . ":" . trim($data["id"]);
            parent::addRDF(parent::triplify($iid, "rdf:type", $qname));
            if (!isset($defined[$qname])) {
                $defined[$qname] = '';
                parent::addRDF(parent::triplifyString($qname, "rdfs:label", $data['label']));
            }
        }

        // source
        if ($a[12] != '-') {
            $data = $this->ParseStringArray($a[12]);
            $qname = trim($data["ns"]) . ":" . trim($data["id"]);
            parent::addRDF(parent::triplify($iid, parent::getVoc() . "source", $qname));
        }

        // confidence
        $list = explode("|", $a[14]);
        foreach ($list as $item) {
            $data = $this->ParseStringArray($item);
            $ns = trim($data["ns"]);
            $id = trim($data["id"]);
            if ($ns == 'lpr') {
                // lowest number of distinct interactions that any one article reported
                parent::addRDF(parent::triplifyString($iid, parent::getVoc() . "minimum-number-interactions-reported", $id));
            } elseif ($ns == "hpr") {
                // higher number of distinct interactions that any one article reports
                parent::addRDF(parent::triplifyString($iid, parent::getVoc() . "maximum-number-interactions-reported", $id));
            } elseif ($ns == 'hp') {
                // total number of unique PMIDs used to support the interaction
                // (this used to be an assignment, which matched every other score too)
                parent::addRDF(parent::triplifyString($iid, parent::getVoc() . "number-supporting-articles", $id));
            }
        }

        // expansion method
        if ($a[15]) {
            $id = parent::getRes() . md5($a[15]);
            parent::addRDF(
                parent::describeIndividual($id, $a[15], parent::getVoc() . "Expansion-Method")
                . parent::describeClass(parent::getVoc() . "Expansion-Method", "Expansion Method")
                . parent::triplify($iid, parent::getVoc() . "expansion-method", $id)
            );
        }

        // host organism
        if ($a[28] != '-') {
            $data = $this->ParseStringArray($a[28]);
            $qname = trim($data["ns"]) . ":" . trim($data["id"]);
            parent::addRDF(parent::triplify($iid, parent::getVoc() . "host-organism", $qname));
        }

        // @todo add to record
        // created 2010/05/18
        $date = str_replace("/", "-", $a[30]) . "T00:00:00Z";
        parent::addRDF(parent::triplifyString($iid, "dc:created", $date, "xsd:dateTime"));

        // taxon-sequence identical interaction group and its canonical counterpart
        parent::addRDF(
            parent::triplify($iid, parent::getVoc() . "taxon-sequence-identical-interaction", $rigid)
            . parent::triplify($rigid, "rdf:type", parent::getVoc() . "Taxon-Sequence-Identical-Interaction")
            . parent::describeClass(parent::getVoc() . "Taxon-Sequence-Identical-Interaction", "Taxon + Sequence Identical Interaction")
            . parent::triplify($rigid, parent::getVoc() . "irigid", $irigid)
            . parent::triplify($rigid, parent::getVoc() . "interactor-a", $rogida)
            . parent::triplify($rogida, parent::getVoc() . "irogid", $irogida)
            . parent::triplify($rigid, parent::getVoc() . "interactor-b", $rogidb)
            . parent::triplify($rogidb, parent::getVoc() . "irogid", $irogidb)
            . parent::triplify($rogida, parent::getVoc() . "canonical-group", $crogida)
            . parent::triplify($rogidb, parent::getVoc() . "canonical-group", $crogidb)
            . parent::triplify($rigid, parent::getVoc() . "taxon-sequence-similar-interaction", $crigid)
            . parent::triplify($crigid, "rdf:type", parent::getVoc() . "Taxon-Sequence-Canonical-Interaction")
            . parent::describeClass(parent::getVoc() . "Taxon-Sequence-Canonical-Interaction", "Taxon + Sequence Canonical Interaction")
            . parent::triplify($crigid, parent::getVoc() . "icrigid", $icrigid)
            . parent::triplify($crigid, parent::getVoc() . "interactor-a-canonical-group", $crogida)
            . parent::triplify($crogida, "rdf:type", parent::getVoc() . "Taxon-Sequence-Similar-Group")
            . parent::triplify($crogida, parent::getVoc() . "icrogid", $icrogida)
            . parent::triplify($crigid, parent::getVoc() . "interactor-b-canonical-group", $crogidb)
            . parent::triplify($crogidb, "rdf:type", parent::getVoc() . "Taxon-Sequence-Similar-Group")
            . parent::triplify($crogidb, parent::getVoc() . "icrogid", $icrogidb)
            . parent::describeClass(parent::getVoc() . "Taxon-Sequence-Similar-Group", "Taxon + Sequence Similar Group")
        );

        parent::writeRDFBufferToWriteFile();
    }
}
/**
 * Download (when needed) and convert the selected NCBI Taxonomy packages,
 * then write the dataset release description file.
 *
 * The 'files' parameter is either 'all' or a comma-separated list of keys of
 * the package map; unknown keys are ignored. Every selected package is
 * downloaded when the 'download' parameter is set or when its local file is
 * missing. Only the zip packages "taxdmp", "gi2taxid_protein" and
 * "gi2taxid_nucleotide" are converted: each known member of the archive is
 * streamed to the converter method named after its content key.
 *
 * Terminates the script when an archive or one of its members cannot be
 * opened.
 */
public function Run()
{
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');

    // which files are to be converted?
    $selectedPackage = trim(parent::getParameterValue('files'));
    if ($selectedPackage == 'all') {
        $files = $this->getPackageMap();
    } else {
        $pm = $this->getPackageMap();
        $files = array();
        foreach (explode(",", $selectedPackage) as $a) {
            if (array_key_exists($a, $pm)) {
                $files[$a] = $pm[$a];
            }
        }
    }

    // Content keys of archive members that have a converter method.
    $convertible = array("names", "nodes", "citations", "gencode", "division", "gi_taxid_prot", "gi_taxid_nucl");

    $dataset_description = '';
    foreach ($files as $key => $value) {
        $lfile = $ldir . $value['filename'];

        // The parameter may be a real boolean or the string 'true'/'false';
        // a loose comparison would treat the string 'false' as true.
        $download = filter_var($this->GetParameterValue('download'), FILTER_VALIDATE_BOOLEAN);
        if (!file_exists($lfile) && !$download) {
            trigger_error($lfile . " not found. Will attempt to download.", E_USER_NOTICE);
            // As before, the flag stays set for the remaining packages.
            $this->SetParameterValue('download', true);
            $download = true;
        }

        // download all files [except mapping file]
        if ($download) {
            $rfile = $value["file_url"];
            echo "downloading " . $rfile . " ... ";
            utils::downloadSingle($rfile, $lfile);
        }

        if ($key == "taxdmp" || $key == "gi2taxid_protein" || $key == "gi2taxid_nucleotide") {
            // make sure we have the zip archive
            $zinfile = $ldir . $value["filename"];
            $zin = new ZipArchive();
            // ZipArchive::open() returns true on success and an error code
            // (never false) on failure.
            if ($zin->open($zinfile) !== true) {
                trigger_error("Unable to open {$zinfile}");
                exit;
            }

            $source_file = (new DataResource($this))
                ->setURI($value['file_url'])
                ->setTitle('NCBI Taxonomy - ' . $key)
                ->setRetrievedDate(date("Y-m-d\\TH:i:sP", filemtime($zinfile)))
                ->setFormat('text/tab-separated-value')
                ->setFormat('application/zip')
                ->setPublisher('http://www.ncbi.nlm.nih.gov')
                ->setHomepage('http://www.ncbi.nlm.nih.gov/taxonomy')
                ->setRights('use')
                ->setRights('attribution')
                ->setLicense('https://www.nlm.nih.gov/copyright.html')
                ->setDataset(parent::getDatasetURI());

            $prefix = parent::getPrefix();
            $bVersion = parent::getParameterValue('bio2rdf_release');
            $date = date("Y-m-d\\TH:i:sP");
            $output_file = (new DataResource($this))
                ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
                ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} - {$key}")
                ->setSource($source_file->getURI())
                ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/taxonomy/taxonomy.php")
                ->setCreateDate($date)
                ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
                ->setPublisher("http://bio2rdf.org")
                ->setRights("use-share-modify")
                ->setRights("restricted-by-source-license")
                ->setLicense("http://creativecommons.org/licenses/by/3.0/")
                ->setDataset(parent::getDatasetURI());
            $dataset_description .= $output_file->toRDF() . $source_file->toRDF();

            // now iterate over the files in the zip archive
            foreach ($value["contents"] as $k => $fn) {
                if (!in_array($k, $convertible, true)) {
                    continue;
                }

                $fpin = $zin->getStream($fn);
                if (!$fpin) {
                    trigger_error("Unable to get pointer to {$fn} in {$zinfile}");
                    exit("failed\n");
                }

                $output_format = parent::getParameterValue('output_format');
                $gzoutfile = $odir . "taxonomy-{$k}" . "." . $output_format;
                $gz = strstr($output_format, 'gz') ? true : false;

                // Read from the stream of the archive member instead of the
                // archive file itself.
                parent::setReadFile($zinfile);
                parent::getReadFile()->SetFilePointer($fpin);
                parent::setWriteFile($gzoutfile, $gz);

                echo "processing {$fn}...\n";
                // The converter method is named after the content key.
                $this->{$k}();
                $this->GetWriteFile()->Close();
                echo "done!" . PHP_EOL;
                parent::clear();
            }
        }
    }

    // Write the release description once, after all packages were handled
    // (it used to be rewritten at the end of every loop iteration).
    $this->setWriteFile($odir . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}
/**
 * Convert the side-effect frequency file (the current read file) into RDF.
 *
 * Each tab-separated row must have 10 columns once '%' signs are stripped.
 * Rows whose concept type is "LLT" are ignored. Every remaining row yields
 * a drug-effect-frequency individual linked to the drug, the effect and a
 * frequency node that carries the reported value plus its lower and upper
 * bounds.
 *
 * @return false|null false after raising an error on a row with an
 *                    unexpected column count; nothing otherwise.
 */
function freq()
{
    $expected = 10;

    parent::setCheckpoint('file');
    while ($line = parent::getReadFile()->read()) {
        $fields = explode("\t", str_replace("%", "", $line));
        $found = count($fields);
        if ($found != $expected) {
            trigger_error("Expecting {$expected}, but found " . $found . " instead... skipping file!", E_USER_ERROR);
            return false;
        }

        // Columns 1 (stereo id) and 2 (cui) are not used.
        $drug = "stitch:{$fields[0]}";
        $placebo = $fields[3];
        $freq = $fields[4];
        $lower = $fields[5];
        $upper = $fields[6];
        $concept_type = $fields[7];
        $concept_id = $fields[8];
        $concept_label = trim($fields[9]);

        if ($concept_type == "LLT") {
            continue;
        }

        // The record id is derived from the raw line.
        $record = "stitch_resource:" . md5("se_freq" . $line);
        $record_type = parent::getVoc() . "Drug-Effect-Frequency";
        parent::addRDF(
            parent::describeIndividual($record, "{$concept_label} frequency for {$drug}", $record_type)
            . parent::describeClass(parent::getVoc() . "Drug-Effect-Frequency", "SIDER Drug-Effect and Frequency")
            . parent::triplify($record, parent::getVoc() . "drug", $drug)
            . parent::triplify($record, parent::getVoc() . "effect", "umls:" . $concept_id)
        );

        if ($placebo) {
            parent::addRDF(parent::triplifyString($record, parent::getVoc() . "placebo", "true", "xsd:boolean"));
        }

        // Classify the frequency: exact number, qualitative term, or a range
        // when the two bounds differ (the range wins over the other two).
        $is_exact = is_numeric($freq);
        $freq_label = $is_exact ? $freq . "%" : $freq;
        $freq_type_label = $is_exact ? "Exact-Frequency" : "Qualitative-Frequency";
        if ($lower != $upper) {
            $freq_label .= "({$lower}-{$upper})";
            $freq_type_label = "Range-Frequency";
        }
        $freq_type = parent::getVoc() . $freq_type_label;

        $freq_node = $record . md5($lower . $upper . $concept_id);
        parent::addRDF(
            parent::triplify($record, parent::getVoc() . "frequency", $freq_node)
            . parent::describeIndividual($freq_node, $freq_label, $freq_type)
            . parent::describeClass($freq_type, $freq_type_label)
        );

        // Numeric percentages are stored as fractions, other values as given.
        $freq_value = $is_exact ? $freq / 100 : $freq;
        parent::addRDF(parent::triplifyString($freq_node, parent::getVoc() . "frequency-value", $freq_value));

        parent::addRDF(
            parent::triplifyString($freq_node, parent::getVoc() . "lower-frequency", sprintf("%.3f", $lower))
            . parent::triplifyString($freq_node, parent::getVoc() . "upper-frequency", sprintf("%.3f", $upper))
        );

        parent::setCheckpoint('record');
    }
    parent::setCheckpoint('file');
}
/**
 * Convert the OBO file set as the current read file into RDF statements and
 * write them to the current write file.
 *
 * The file is read line by line; blank lines and lines starting with '!' are
 * skipped. A "[Term]" or "[Typedef]" line switches the parsing state; lines
 * seen before the first stanza are treated as ontology header tags and are
 * attached as literals to the ontology URI http://bio2rdf.org/lsr:<abbv>.
 * Within a stanza each "tag: value" line (a trailing " !comment" is cut off
 * first) is dispatched on the tag name: id, name, def, is_a, is_obsolete,
 * xref, synonym, alt_id, property_value, intersection_of and relationship
 * have dedicated handling, every other tag becomes an obo_vocabulary:<tag>
 * literal.
 *
 * Two buffers are filled in parallel and reset after every line: $buf with
 * all statements and $min with a reduced set (labels, definitions, is_a,
 * deprecation and the default superclass). When the 'detail' parameter is
 * 'min' or 'min+' only $min is written, otherwise only $buf. The blank-node
 * statements built for intersection_of / relationship runs are appended to
 * $min only, and only in 'min+' mode, once a line with a different tag
 * follows.
 *
 * A term that had neither an is_a tag nor an obsolete flag is made a
 * subclass of obo_vocabulary:Entity when the next "[Term]" line is reached.
 *
 * NOTE(review): that default superclass is therefore never emitted for the
 * last term of the file, nor for a term directly followed by "[Typedef]"
 * (the typedef branch resets $tid without the check).
 * NOTE(review): an intersection_of / relationship run that is still pending
 * at end of file is not written; the flush statements after the loop are
 * commented out.
 * NOTE(review): the `continue` statements in the xref handling skip the
 * write at the end of the iteration; the buffers are not reset in that case,
 * so their content is carried over to the next line.
 * NOTE(review): $exc, $header, $comment, $b_text and $str_len are assigned
 * but never read in this function.
 *
 * @param string $abbv Ontology abbreviation; it is lower-cased and "doid" is
 *                     mapped to "do" before it is used in the ontology and
 *                     dataset URIs.
 */
function OBO2RDF($abbv) { $abbv = strtolower($abbv); if ($abbv == "doid") { $abbv = "do"; } $minimal = parent::getParameterValue('detail') == 'min' ? true : false; $minimalp = parent::getParameterValue('detail') == 'min+' ? true : false; $version = parent::getParameterValue("bio2rdf_release"); $tid = ''; $first = true; $is_a = false; $is_deprecated = false; $min = $buf = ''; $ouri = "http://bio2rdf.org/lsr:" . $abbv; $dataset_uri = $abbv . "_resource:bio2rdf.dataset.{$abbv}.R" . $version; parent::setGraphURI($dataset_uri); $buf = parent::triplify($ouri, "rdf:type", "owl:Ontology"); $graph_uri = '<' . parent::getRegistry()->getFQURI(parent::getGraphURI()) . '>'; $bid = 1; while ($l = parent::getReadFile()->read()) { $lt = trim($l); if (strlen($lt) == 0) { continue; } if ($lt[0] == '!') { continue; } if (strstr($l, "[Term]")) { // first node? if ($first == true) { // ignore the first case $first = false; } else { if ($tid != '' && $is_a == false && $is_deprecated == false) { $t = parent::triplify($tid, "rdfs:subClassOf", "obo_vocabulary:Entity"); $buf .= $t; $min .= $t; } } $is_a = false; $is_deprecated = false; unset($typedef); $term = ''; $tid = ''; continue; } else { if (strstr($l, "[Typedef]")) { $is_a = false; $is_deprecated = false; unset($term); $tid = ''; $typedef = ''; continue; } } //echo "LINE: $l".PHP_EOL; // to fix error in obo generator $lt = str_replace("synonym ", "synonym: ", $lt); $lt = preg_replace("/\\{.*\\} !/", " !", $lt); $a = explode(" !", $lt); if (isset($a[1])) { $exc = trim($a[1]); } $a = explode(": ", trim($a[0]), 2); // let's go if (isset($intersection_of)) { if ($a[0] != "intersection_of") { // $intersection_of .= ")].".PHP_EOL; //$buf .= $intersection_of; if ($minimalp) { $min .= $intersection_of; } unset($intersection_of); } } if (isset($relationship)) { if ($a[0] != "relationship") { // $relationship .= ")].".PHP_EOL; //$buf .= $relationship; if ($minimalp) { $min .= $relationship; } unset($relationship); } } if (isset($typedef)) { if 
($a[0] == "id") { $c = explode(":", $a[1]); if (count($c) == 1) { $ns = "obo"; $id = $c[0]; } else { $ns = strtolower($c[0]); $id = $c[1]; } $id = str_replace(array("(", ")"), array("_", ""), $id); $tid = $ns . ":" . $id; } else { if ($a[0] == "name") { $buf .= parent::describeClass($tid, addslashes(stripslashes($a[1]))); } else { if ($a[0] == "is_a") { if (FALSE !== ($pos = strpos($a[1], "!"))) { $a[1] = substr($a[1], 0, $pos - 1); } $buf .= parent::triplify($tid, "rdfs:subPropertyOf", "obo_vocabulary:" . strtolower($a[1])); } else { if ($a[0] == "is_obsolete") { $buf .= parent::triplify($tid, "rdf:type", "owl:DeprecatedClass"); $is_deprecated = true; } else { if ($a[0][0] == "!") { $a[0] = substr($a[0], 1); } $buf .= parent::triplifyString($tid, "obo_vocabulary:{$a['0']}", str_replace('"', '', stripslashes($a[1]))); } } } } } else { if (isset($term)) { if ($a[0] == "is_obsolete" && $a[1] == "true") { $t = parent::triplify($tid, "rdf:type", "owl:DeprecatedClass"); $t .= parent::triplify($tid, "rdfs:subClassOf", "owl:DeprecatedClass"); $min .= $t; $buf .= $t; $is_deprecated = true; } else { if ($a[0] == "id") { parent::getRegistry()->parseQName($a[1], $ns, $id); $tid = "{$ns}:{$id}"; // $buf .= parent::describeClass($tid,null,"owl:Class"); // $buf .= parent::triplify($tid,"rdfs:isDefinedBy",$ouri); } else { if ($a[0] == "name") { // $t = parent::triplifyString($tid,"rdfs:label",str_replace(array("\"", "'"), array("","\\\'"), stripslashes($a[1]))." 
[$tid]"); $label = str_replace(array("\"", "'"), array("", "\\\\'"), stripslashes($a[1])); $t = parent::describeIndividual($tid, $label, "owl:Class"); $t .= parent::triplify($tid, "rdfs:isDefinedBy", $ouri); $min .= $t; $buf .= $t; } else { if ($a[0] == "def") { $t = str_replace(array("'", "\"", "\\", "\\\\'"), array("\\\\'", "", "", ""), $a[1]); $min .= parent::triplifyString($tid, "dc:description", $t); $buf .= parent::triplifyString($tid, "dc:description", $t); } else { if ($a[0] == "property_value") { $b = explode(" ", $a[1]); $buf .= parent::triplifyString($tid, "obo_vocabulary:" . strtolower($b[0]), str_replace("\"", "", strtolower($b[1]))); } else { if ($a[0] == "xref") { // http://upload.wikimedia.org/wikipedia/commons/3/34/Anatomical_Directions_and_Axes.JPG // Medical Dictionary:http\://www.medterms.com/ // KEGG COMPOUND:C02788 "KEGG COMPOUND" // id-validation-regexp:\"REACT_[0-9\]\{1\,4}\\.[0-9\]\{1\,3}|[0-9\]+\" //$a[1] = 'id-validation-regexp:\"REACT_[0-9\]\{1\,4}\\.[0-9\]\{1\,3}|[0-9\]+\"'; if (substr($a[1], 0, 4) == "http") { $buf .= parent::triplify($tid, "rdfs:seeAlso", str_replace(array(" ", '"wiki"', "\\"), array("+", "", ""), $a[1])); } else { $b = explode(":", $a[1], 2); if (substr($b[1], 0, 4) == "http") { $buf .= parent::triplify($tid, "rdfs:seeAlso", stripslashes($b[1])); } else { $ns = str_replace(array(" ", "\\"), "", strtolower($b[0])); $id = trim($b[1]); // there may be a comment to remove if (FALSE !== ($pos = strrpos($id, ' "'))) { $comment = substr($id, $pos + 1, -1); $id = substr($id, 0, $pos); } $id = stripslashes($id); // there may be a source statement to remove $id = preg_replace("/{.*\\}/", "", $id); if ($ns == "pmid") { $ns = "pubmed"; $y = explode(" ", $id); $id = $y[0]; } if ($ns == "xx") { continue; } if ($ns == "icd9cm") { $y = explode(" ", $id); $id = $y[0]; } if ($ns == "xref; umls_cui") { continue; } if ($ns == "submitter") { $ns = "chebi.submitter"; } if ($ns == "wikipedia" || $ns == "mesh") { $id = str_replace(" ", "+", 
$id); } if ($ns == "id-validation-regexp") { $buf .= parent::triplifyString($tid, "obo_vocabulary:{$ns}", addslashes($id)); } else { $buf .= parent::triplify($tid, "obo_vocabulary:x-{$ns}", "{$ns}:" . str_replace(" ", "-", $id)); } } } } else { if ($a[0] == "synonym") { // synonym: "entidades moleculares" RELATED [IUPAC:] // synonym: "molecular entity" EXACT IUPAC_NAME [IUPAC:] // synonym: "Chondrococcus macrosporus" RELATED synonym [NCBITaxonRef:Krzemieniewska_and_Krzemieniewski_1926] //grab string inside double quotes preg_match('/"(.*)"(.*)/', $a[1], $matches); if (!empty($matches)) { $a[1] = str_replace(array("\\", "\"", "'"), array("", "", "\\\\'"), $matches[1] . $matches[2]); } else { $a[1] = str_replace(array("\"", "'"), array("", "\\\\'"), $a[1]); } $rel = "SYNONYM"; $list = array("EXACT", "BROAD", "RELATED", "NARROW"); $found = false; foreach ($list as $keyword) { // get everything after the keyword up until the bracket [ if (FALSE !== ($k_pos = strpos($a[1], $keyword))) { $str_len = strlen($a[1]); $keyword_len = strlen($keyword); $keyword_end_pos = $k_pos + $keyword_len; $b1_pos = strrpos($a[1], "["); $b2_pos = strrpos($a[1], "]"); $b_text = substr($a[1], $b1_pos + 1, $b2_pos - $b1_pos - 1); $diff = $b1_pos - $keyword_end_pos - 1; if ($diff != 0) { // then there is more stuff here $k = substr($a[1], $keyword_end_pos + 1, $diff); $rel = trim($k); } else { // create the long predicate $rel = $keyword . "_SYNONYM"; } $found = true; $str = substr($a[1], 0, $k_pos - 1); break; } } // check to see if we still haven't found anything if ($found === false) { // we didn't find one of the keywords // so take from the start to the bracket $b1_pos = strrpos($a[1], "["); $str = substr($a[1], 0, $b1_pos - 1); } $rel = str_replace(" ", "_", $rel); // $lit = addslashes($str.($b_text?" [".$b_text."]":"")); $l = parent::triplifyString($tid, "obo_vocabulary:" . 
strtolower($rel), $str); $buf .= $l; } else { if ($a[0] == "alt_id") { parent::getRegistry()->parseQname($a[1], $ns, $id); if ($id != 'curators') { $buf .= parent::triplify("{$ns}:{$id}", "rdfs:seeAlso", stripslashes($tid)); } } else { if ($a[0] == "is_a") { // do subclassing parent::getRegistry()->parseQName($a[1], $ns, $id); $t = parent::triplify($tid, "rdfs:subClassOf", "{$ns}:{$id}"); $buf .= $t; $min .= $t; $is_a = true; } else { if ($a[0] == "intersection_of") { if (!isset($intersection_of)) { // $intersection_of = '<'.parent::getRegistry()->getFQURI($tid).'> <'.parent::getRegistry()->getFQURI('owl:equivalentClass').'> [<'.parent::getRegistry()->getFQURI('rdf:type').'> <'.parent::getRegistry()->getFQURI('owl:Class').'>; <'.parent::getRegistry()->getFQURI('owl:intersectionOf').'> ('; $intersection_of = '<' . parent::getRegistry()->getFQURI($tid) . '> <' . parent::getRegistry()->getFQURI('owl:equivalentClass') . '> _:b' . ++$bid . " {$graph_uri} ." . PHP_EOL; $intersection_of .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('rdf:type') . '> <' . parent::getRegistry()->getFQURI('owl:Class') . "> {$graph_uri} ." . PHP_EOL; $intersection_of .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:intersectionOf') . '> _:b' . ++$bid . " {$graph_uri} ." . PHP_EOL; } /* intersection_of: ECO:0000206 ! BLAST evidence intersection_of: develops_from VAO:0000092 ! chondrogenic condensation intersection_of: OBO_REL:has_part VAO:0000040 ! cartilage tissue */ $c = explode(" ", $a[1]); if (count($c) == 1) { // just a class parent::getRegistry()->parseQName($c[0], $ns, $id); $intersection_of .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('rdfs:subClassOf') . '> <' . parent::getRegistry()->getFQURI("{$ns}:{$id}") . "> {$graph_uri} ." . 
PHP_EOL; $buf .= parent::triplify($tid, "rdfs:subClassOf", "{$ns}:{$id}"); } else { if (count($c) == 2) { // an expression parent::getRegistry()->parseQName($c[0], $pred_ns, $pred_id); parent::getRegistry()->parseQName($c[1], $obj_ns, $obj_id); $intersection_of .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:onProperty') . '> <' . parent::getRegistry()->getFQURI("obo_vocabulary:" . $pred_id) . "> {$graph_uri} ." . PHP_EOL; $intersection_of .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:someValuesFrom') . '> <' . parent::getRegistry()->getFQURI("{$obj_ns}:{$obj_id}") . "> {$graph_uri} ." . PHP_EOL; $buf .= parent::triplify($tid, "obo_vocabulary:{$pred_id}", "{$obj_ns}:{$obj_id}"); } } } else { if ($a[0] == "relationship") { if (!isset($relationship)) { $relationship = '<' . parent::getRegistry()->getFQURI($tid) . '> <' . parent::getRegistry()->getFQURI('rdfs:subClassOf') . '> _:b' . ++$bid . " {$graph_uri} ." . PHP_EOL; $relationship .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('rdf:type') . '> <' . parent::getRegistry()->getFQURI('owl:Class') . "> {$graph_uri} ." . PHP_EOL; $relationship .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:intersectionOf') . '> _:b' . ++$bid . " {$graph_uri} ." . PHP_EOL; } /* relationship: develops_from VAO:0000092 ! chondrogenic condensation relationship: OBO_REL:has_part VAO:0000040 ! cartilage tissue */ $c = explode(" ", $a[1]); if (count($c) == 1) { // just a class parent::getRegistry()->parseQName($c[0], $ns, $id); $relationship .= parent::getRegistry()->getFQURI("{$ns}:{$id}"); $buf .= parent::triplify($tid, "rdfs:subClassOf", "{$ns}:{$id}"); } else { if (count($c) == 2) { // an expression parent::getRegistry()->parseQName($c[0], $pred_ns, $pred_id); parent::getRegistry()->parseQName($c[1], $obj_ns, $obj_id); $relationship .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:onProperty') . '> <' . parent::getRegistry()->getFQURI("obo_vocabulary:" . $pred_id) . 
"> {$graph_uri} ." . PHP_EOL; $relationship .= '_:b' . $bid . ' <' . parent::getRegistry()->getFQURI('owl:someValuesFrom') . '> <' . parent::getRegistry()->getFQURI("{$obj_ns}:{$obj_id}") . "> {$graph_uri} ." . PHP_EOL; $buf .= parent::triplify($tid, "obo_vocabulary:{$pred_id}", "{$obj_ns}:{$obj_id}"); } } } else { // default handler if (isset($a[1])) { $buf .= parent::triplifyString($tid, "obo_vocabulary:{$a['0']}", str_replace(array("\"", "'"), array("", "\\\\'"), stripslashes($a[1]))); } } } } } } } } } } } } } else { //header //format-version: 1.0 $buf .= parent::triplifyString($ouri, "obo_vocabulary:{$a['0']}", str_replace(array('"', '\\:'), array('\\"', ':'), isset($a[1]) ? $a[1] : "")); } } if ($minimal || $minimalp) { parent::getWriteFile()->write($min); } else { parent::getWriteFile()->write($buf); } $min = ''; $buf = ''; $header = ''; } //if(isset($intersection_of)) $buf .= $intersection_of.")].".PHP_EOL; //if(isset($relationship)) $buf .= $relationship.")].".PHP_EOL; if ($minimal || $minimalp) { parent::getWriteFile()->Write($min); } else { parent::getWriteFile()->write($buf); } }