/**
 * Convert a PubChem substance XML record to RDF.
 *
 * For the substance identified by PC-Substance_sid this emits: its type and
 * version, a link to every referenced PubChem compound that carries a cid,
 * the depositor's source identifier, its synonyms and its non-empty comments.
 *
 * @param object $xml Reader exposing GetXMLRoot(); the returned root is
 *                    queried with xpath().
 * @return void
 */
function parse_substance_record(&$xml)
{
    $root = $xml->GetXMLRoot();

    // pubchem identifier and version
    // xpath() results are kept in a variable and indexed directly: array_shift()
    // takes its argument by reference, and xpath() may return false on error.
    $sid_nodes = $root->xpath('//PC-Substance_sid/PC-ID/PC-ID_id');
    $sid = $sid_nodes ? (string) $sid_nodes[0] : "";
    if ($sid === "") {
        // Without an identifier there is no subject to attach triples to.
        trigger_error("Substance record has no PC-ID_id - record skipped", E_USER_WARNING);
        return;
    }
    $version_nodes = $root->xpath('//PC-Substance_sid/PC-ID/PC-ID_version');
    $sid_version = $version_nodes ? (string) $version_nodes[0] : "";

    $psid = $this->getPcsNs() . $sid;
    parent::addRDF(parent::describeIndividual($psid, null, $this->getPcsVoc() . "Substance"));
    parent::addRDF(parent::triplifyString($psid, $this->getPcsVoc() . "version", parent::safeLiteral($sid_version)));

    // reference to pubchem compounds
    $pc_compounds = $root->xpath('//PC-Substance_compound/PC-Compounds/PC-Compound');
    foreach ($pc_compounds ?: array() as $compound) {
        $cid_nodes = $compound->xpath('./PC-Compound_id/PC-CompoundType/PC-CompoundType_id/PC-CompoundType_id_cid');
        $cid = $cid_nodes ? (string) $cid_nodes[0] : "";
        if ($cid !== "") {
            $pcid = $this->getPccNs() . $cid;
            parent::addRDF(parent::triplify($psid, $this->getPcsVoc() . "compound", $pcid));
        }
    }

    // database cross references (xref)
    // source identifier
    $source_nodes = $root->xpath('//PC-Substance_source/PC-Source/PC-Source_db/PC-DBTracking/PC-DBTracking_source-id/Object-id/Object-id_str');
    $source_id = $source_nodes ? (string) $source_nodes[0] : "";
    if ($source_id !== "") {
        parent::addRDF(parent::triplifyString($psid, $this->getPcsVoc() . "source-identifier", parent::safeLiteral($source_id)));
    }

    // synonyms
    $synonyms = $root->xpath('//PC-Substance_synonyms/PC-Substance_synonyms_E');
    foreach ($synonyms ?: array() as $synonym) {
        parent::addRDF(parent::triplifyString($psid, $this->getPcsVoc() . "synonym", parent::safeLiteral((string) $synonym)));
    }

    // comments
    $comments = $root->xpath('//PC-Substance_comment/PC-Substance_comment_E');
    foreach ($comments ?: array() as $comment) {
        // Compare the element's text, not the element object: an object is
        // never strictly identical to a string, so empty comments slipped through.
        $comment_text = (string) $comment;
        if ($comment_text !== "") {
            parent::addRDF(parent::triplifyString($psid, "rdfs:comment", parent::safeLiteral($comment_text)));
        }
    }
}
/**
 * Convert the comma-separated model-organism gene file to RDF.
 *
 * The first line read is treated as the header and must contain exactly
 * eight comma-separated columns. Every following line is parsed as CSV and
 * emitted as one "Aging-Related-Gene" individual, and the RDF buffer is
 * flushed after each row.
 *
 * @return false|null false when the header does not have the expected number
 *                    of columns; otherwise nothing is returned.
 */
function models()
{
    // Organism name (as it appears in column 3) => NCBI taxonomy identifier.
    $tax_ids = array(
        "Caenorhabditis elegans" => "6239",
        "Mus musculus" => "10090",
        "Saccharomyces cerevisiae" => "4932",
        "Drosophila melanogaster" => "7227",
        "Podospora anserina" => "5145",
        "Mesocricetus auratus" => "10036",
        "Schizosaccharomyces pombe" => "4896",
        "Danio rerio" => "7955",
    );

    $h = explode(",", parent::getReadFile()->read());
    $expected_columns = 8;
    if (($n = count($h)) != $expected_columns) {
        trigger_error("Found {$n} columns in gene file - expecting {$expected_columns}!", E_USER_WARNING);
        return false;
    }

    /*
      [0] GenAge ID
      [1] symbol
      [2] name
      [3] organism
      [4] entrez gene id
      [5] avg lifespan change (max obsv)
      [6] lifespan effect
      [7] longevity influence
    */
    while ($l = parent::getReadFile()->read(200000)) {
        // Blank lines carry no record; skip them instead of emitting an empty gene.
        if (trim($l) === "") {
            continue;
        }

        $data = str_getcsv($l);
        if (count($data) < $expected_columns) {
            trigger_error("Skipping row with " . count($data) . " columns - expecting {$expected_columns}!", E_USER_WARNING);
            continue;
        }

        $genage = str_pad($data[0], 4, "0", STR_PAD_LEFT);
        $gene_symbol = $data[1];
        $name = $data[2];
        $organism = $data[3];
        $ncbi_gene_id = $data[4];
        $max_percent_obsv_avg_lifespan_change = $data[5];
        $lifespan_effect = $data[6];
        $longevity_influence = $data[7];

        $genage_id = parent::getNamespace() . $genage;

        parent::addRDF(
            parent::describeIndividual($genage_id, $name, parent::getVoc() . "Aging-Related-Gene") .
            parent::describeClass(parent::getVoc() . "Aging-Related-Gene", "Aging Related Gene")
        );
        parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "gene-symbol", parent::safeLiteral($gene_symbol)));

        // Only link a taxon when the organism is in the lookup table; an
        // unknown organism would otherwise yield a bare "ncbitaxon:" object.
        if (isset($tax_ids[$organism])) {
            parent::addRDF(parent::triplify($genage_id, parent::getVoc() . "taxon", "ncbitaxon:" . $tax_ids[$organism]));
        } else {
            trigger_error("No NCBI taxonomy identifier known for organism '{$organism}' ({$genage_id})", E_USER_WARNING);
        }

        if ($ncbi_gene_id !== "") {
            parent::addRDF(parent::triplify($genage_id, parent::getVoc() . "x-ncbigene", "ncbigene:" . $ncbi_gene_id));
        }
        if ($max_percent_obsv_avg_lifespan_change !== "") {
            parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "maximum-percent-observed-average-lifespan-change", parent::safeLiteral($max_percent_obsv_avg_lifespan_change)));
        }

        // The combined value is split into two separate lifespan-effect statements.
        if ($lifespan_effect == "Increase and Decrease") {
            parent::addRDF(
                parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", "increase") .
                parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", "decrease")
            );
        } else {
            parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "lifespan-effect", strtolower($lifespan_effect)));
        }

        parent::addRDF(parent::triplifyString($genage_id, parent::getVoc() . "longevity-influence", strtolower($longevity_influence)));
        parent::WriteRDFBufferToWriteFile();
    }
}
/**
 * Convert the tab-separated gene file to RDF.
 *
 * The first line read is treated as the header and must contain exactly 14
 * tab-separated columns. Each following line becomes one "Gene" individual
 * whose label is column 3; the id => name pair is also remembered in
 * $this->genes. The RDF buffer is flushed after each row.
 *
 * Columns as used below: [0] accession id, [1] NCBI gene id, [2] Ensembl id,
 * [3] name, [4] symbol, [5] alternative names, [6] alternate symbols,
 * [7] is-vip, [8] has-variant-annotation, [9] cross references,
 * [10] CPIC dosing guideline, [11] chromosome, [12] start, [13] end.
 *
 * @return false|null false when the header does not have the expected number
 *                    of columns; otherwise nothing is returned.
 */
function genes()
{
    $h = explode("\t", parent::getReadFile()->read());
    $expected_columns = 14;
    if (($n = count($h)) != $expected_columns) {
        trigger_error("Found {$n} columns in gene file - expecting {$expected_columns}!", E_USER_WARNING);
        return false;
    }

    while ($l = parent::getReadFile()->read(200000)) {
        // Blank lines carry no record.
        if (trim($l) === "") {
            continue;
        }
        // Pad short rows so every column index read below exists.
        $a = array_pad(explode("\t", $l), $expected_columns, "");

        $id = parent::getNamespace() . $a[0];
        $label = $a[3];
        $this->genes[$a[0]] = $a[3];

        parent::addRDF(
            parent::describeIndividual($id, $label, parent::getVoc() . "Gene") .
            parent::describeClass(parent::getVoc() . "Gene", "PharmGKB Gene")
        );

        // link data
        parent::addRDF(
            parent::triplify($id, "rdfs:seeAlso", "http://pharmgkb.org/gene/" . $a[0]) .
            parent::triplify($id, "rdfs:seeAlso", "http://www4.wiwiss.fu-berlin.de/diseasome/resource/genes/" . $a[0]) .
            parent::triplify($id, "rdfs:seeAlso", "http://dbpedia.org/resource/" . $a[0])
        );

        if ($a[1]) {
            parent::addRDF(parent::triplify($id, parent::getVoc() . "x-ncbigene", "ncbigene:" . $a[1]));
        }
        if ($a[2]) {
            parent::addRDF(parent::triplify($id, parent::getVoc() . "x-ensembl", "ensembl:" . $a[2]));
        }
        if ($a[3]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "name", $a[3]) .
                parent::describeProperty(parent::getVoc() . "name", "Relationship between a PharmGKB entity and its name")
            );
        }
        if ($a[4]) {
            parent::addRDF(
                parent::triplify($id, parent::getVoc() . "symbol", "symbol:" . $a[4]) .
                parent::describeProperty(parent::getVoc() . "symbol", "Relationship between a PharmGKB gene and a gene symbol")
            );
        }
        if ($a[5]) {
            // The field is a list of double-quoted values separated by commas.
            // NOTE(review): substr(..., 1, -2) drops the first character and the
            // last TWO characters - confirm against the data that the field really
            // ends with two characters to discard, else the last name is truncated.
            $b = explode('","', substr($a[5], 1, -2));
            foreach ($b as $alt_name) {
                parent::addRDF(parent::triplifyString($id, parent::getVoc() . "alternative-name", parent::safeLiteral(trim(stripslashes($alt_name)))));
            }
            parent::addRDF(parent::describeProperty(parent::getVoc() . "alternative-name", "Relationship between a PharmGKB gene and an alternative name"));
        }
        if ($a[6]) {
            // these are not hgnc symbols
            // NOTE(review): same quote-stripping caveat as for the alternative names.
            $b = explode('","', substr($a[6], 1, -2));
            foreach ($b as $alt_symbol) {
                parent::addRDF(parent::triplifyString($id, parent::getVoc() . "alternate-symbol", trim($alt_symbol)));
            }
            // Describe the predicate itself (property URI, label), matching every
            // other describeProperty() call in this method.
            parent::addRDF(parent::describeProperty(parent::getVoc() . "alternate-symbol", "Relationship between a PharmGKB gene and an alternate gene symbol"));
        }
        if ($a[7]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "is-vip", $a[7]) .
                parent::describeProperty(parent::getVoc() . "is-vip", "Relationship between a PharmGKB gene and its vip status")
            );
        }
        if ($a[8]) {
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "has-variant-annotation", $a[8]) .
                parent::describeProperty(parent::getVoc() . "has-variant-annotation", "Relationship between a PharmGKB gene and whether it has a variant annotation")
            );
        }
        if ($a[9]) {
            $b = explode(",", $a[9]);
            foreach ($b as $xref) {
                $xref = trim($xref);
                if (!$xref) {
                    continue;
                }

                // presumably MapXrefs() fills $url, $ns and $id2 by reference - verify against its definition
                $url = false;
                $x = $this->MapXrefs($xref, $url, $ns, $id2);
                $ns = str_replace(' ', '', $ns);
                if ($url == true) {
                    parent::addRDF(parent::QQuadO_URL($id, parent::getVoc() . "x-{$ns}", $x));
                } else {
                    parent::addRDF(parent::triplify($id, parent::getVoc() . "x-{$ns}", $x));
                }
            }
        }
        if ($a[10]) {
            parent::addRDF(parent::triplifyString($id, parent::getVoc() . "cpic-dosing-guideline", $a[10]));
        }
        if ($a[11]) {
            // The described property must be the same "chromosome" predicate that
            // is used in the triple on the line above it.
            parent::addRDF(
                parent::triplifyString($id, parent::getVoc() . "chromosome", $a[11]) .
                parent::describeProperty(parent::getVoc() . "chromosome", "Relationship between a PharmGKB gene and its chromosomal position") .
                parent::triplifyString($id, parent::getVoc() . "chromosome-start", $a[12]) .
                parent::triplifyString($id, parent::getVoc() . "chromosome-end", $a[13])
            );
        }

        parent::WriteRDFBufferToWriteFile();
    }
}
/**
 * Read the input file line by line, assemble GenBank records and emit RDF.
 *
 * Lines are accumulated (lines consisting of a newline only are dropped)
 * until a line matching "//" is read. The accumulated text is then parsed
 * into its sections (LOCUS, DEFINITION, ACCESSION, VERSION, KEYWORDS,
 * SEGMENT, FEATURES, SOURCE, CONTIG, REFERENCE), converted to RDF and
 * flushed to the write file, after which accumulation starts over.
 *
 * @return void
 */
function process()
{
    $gb_record_str = "";
    while ($aLine = $this->getReadFile()->Read(4096)) {
        // A line holding only "//" terminates the current record.
        preg_match("/^\\/\\/\$/", $aLine, $matches);
        if (count($matches)) {
            // SEGMENT and CONTIG are optional. Clear their parse results for
            // every record so values from an earlier record cannot leak into
            // one that lacks the section.
            $parsed_segments_arr = null;
            $parsed_contig_arr = null;

            // now remove the header if it is there
            $gb_record_str = $this->removeHeader($gb_record_str);
            $sectionsRaw = $this->parseGenbankRaw($gb_record_str);

            /**
             * SECTIONS being parsed:
             * locus, definition, accession, version, keywords, segment, source, reference, features
             */
            // get locus section(s)
            $locus = $this->retrieveSections("LOCUS", $sectionsRaw);
            $parsed_locus_arr = $this->parseLocus($locus);
            // get the definition section
            $definition = $this->retrieveSections("DEFINITION", $sectionsRaw);
            $parsed_definition_arr = $this->parseDefinition($definition);
            // get the accession
            $accessions = $this->retrieveSections("ACCESSION", $sectionsRaw);
            $parsed_accession_arr = $this->parseAccession($accessions);
            // get the version
            $versions = $this->retrieveSections("VERSION", $sectionsRaw);
            $parsed_version_arr = $this->parseVersion($versions);
            // get the keywords
            $keywords = $this->retrieveSections("KEYWORDS", $sectionsRaw);
            $parsed_keyword_arr = $this->parseKeywords($keywords);
            // may not be any segment section
            $segments = $this->retrieveSections("SEGMENT", $sectionsRaw);
            if (!empty($segments)) {
                $parsed_segments_arr = $this->parseSegment($segments);
            }
            $features = $this->retrieveSections("FEATURES", $sectionsRaw);
            $parsed_features_arr = $this->parseFeatures($features);
            // get the source section
            $source = $this->retrieveSections("SOURCE", $sectionsRaw);
            $parsed_source_arr = $this->parseSource($source);
            // may not be any contig section
            $contig = $this->retrieveSections("CONTIG", $sectionsRaw);
            if (!empty($contig)) {
                $parsed_contig_arr = $this->parseContig($contig);
            }
            // get the reference section
            $references = $this->retrieveSections("REFERENCE", $sectionsRaw);
            $parsed_refs_arr = $this->parseReferences($references);

            // The record resource is keyed on the gi number from the VERSION section.
            $gb_res = "gi:" . $parsed_version_arr['gi'];
            $gb_label = utf8_encode(htmlspecialchars($parsed_definition_arr[0]));
            parent::AddRDF(
                parent::describeIndividual($gb_res, $gb_label, $this->getVoc() . "genbank-record") .
                parent::triplifyString($gb_res, $this->getVoc() . 'sequence-length', $parsed_locus_arr[0]['sequence_length']) .
                parent::triplifyString($gb_res, $this->getVoc() . 'strandedness', $parsed_locus_arr[0]['strandedness']) .
                parent::triplify($gb_res, "rdf:type", $this->getRes() . $parsed_locus_arr[0]['mol_type']) .
                parent::triplifyString($gb_res, $this->getVoc() . 'chromosome-shape', $parsed_locus_arr[0]['chromosome_shape']) .
                parent::triplifyString($gb_res, $this->getVoc() . 'division-name', $parsed_locus_arr[0]['division_name']) .
                parent::triplifyString($gb_res, $this->getVoc() . 'date-of-entry', $parsed_locus_arr[0]['date']) .
                parent::triplifyString($gb_res, $this->getVoc() . 'source', utf8_encode($parsed_source_arr[0])) .
                parent::QQuadO_URL($gb_res, $this->getVoc() . 'fasta-seq', 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?sendto=on&db=nucest&dopt=fasta&val=' . $parsed_version_arr['gi'])
            );

            foreach ($parsed_features_arr as $aFeature) {
                // getFeatures
                $type = $aFeature['type'];
                $feat_desc = $this->getFeatures($type);
                $label = preg_replace('/\\s\\s*/', ' ', $feat_desc['definition']);
                $comment = null;
                $value = $aFeature['value'];
                // The value is split on "/": the first piece is used as the
                // location, every piece is then scanned for key=value pairs.
                $value_arr = explode("/", $value);
                $location = preg_replace('/\\n/', '', $value_arr[0]);
                $class_id = parent::getVoc() . md5($type);
                $feat_res = parent::getRes() . md5($type . $location . $gb_res);
                $feat_label = utf8_encode($type . " " . $location . " for " . $gb_res);
                if (isset($feat_desc['comment'])) {
                    $comment = $feat_desc['comment'];
                    $comment = preg_replace('/\\s\\s*/', ' ', $comment);
                    $label .= " " . $comment;
                }

                parent::AddRDF(
                    parent::describeClass($class_id, $label, parent::getVoc() . "Feature") .
                    parent::describeIndividual($feat_res, $feat_label, $class_id) .
                    parent::triplify($gb_res, $this->getVoc() . "has-feature", $feat_res)
                );

                foreach ($value_arr as $aL) {
                    // check if aL has an equals in it
                    $p = "/(\\S+)\\=(.*)/";
                    preg_match($p, $aL, $m);
                    if (count($m)) {
                        if ($m[1] == "db_xref") {
                            parent::AddRDF(parent::triplify($feat_res, "rdfs:seeAlso", str_replace("\"", "", $m[2])));
                        } else {
                            parent::AddRDF(parent::triplifyString($feat_res, $this->getVoc() . $m[1], utf8_encode(str_replace("\"", "", $m[2]))));
                        }
                    }
                }
            }

            foreach ($parsed_accession_arr[0] as $acc) {
                parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "accession", $acc));
            }

            if (isset($parsed_version_arr['versioned_accession'])) {
                parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "versioned-accession", $parsed_version_arr['versioned_accession']));
            }

            if (isset($parsed_contig_arr)) {
                foreach ($parsed_contig_arr as $aContig) {
                    parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "contig", parent::safeLiteral($aContig)));
                }
            }

            foreach ($parsed_keyword_arr as $akw) {
                parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "keyword", $akw));
            }

            if (isset($parsed_segments_arr)) {
                foreach ($parsed_segments_arr as $aSeg) {
                    parent::AddRDF(
                        parent::triplifyString($gb_res, $this->getVoc() . "segment-number", $aSeg['segment_number']) .
                        parent::triplifyString($gb_res, $this->getVoc() . "total-segments", $aSeg['total_segments'])
                    );
                }
            }

            foreach ($parsed_refs_arr as $ref_index => $aRef) {
                // Name the reference after its record and position rather than a
                // random number, so identifiers are reproducible between runs and
                // two references cannot end up sharing a random value.
                $ref_res = $this->getRes() . md5("reference#" . $ref_index . "@" . $gb_res);
                $ref_label = "reference for " . $gb_res;
                if (isset($aRef['TITLE'])) {
                    parent::AddRDF(
                        parent::describeIndividual($ref_res, $ref_label, $this->getVoc() . "reference") .
                        parent::triplifyString($ref_res, $this->getVoc() . "title", $aRef['TITLE'])
                    );
                }
                if (isset($aRef['PUBMED'])) {
                    parent::AddRDF(parent::triplify($ref_res, $this->getVoc() . "x-pubmed", 'pubmed:' . $aRef['PUBMED']));
                }
                if (isset($aRef['AUTHORS'])) {
                    parent::AddRDF(parent::triplifyString($ref_res, $this->getVoc() . "authors", $aRef['AUTHORS']));
                }

                parent::AddRDF(parent::triplify($gb_res, $this->getVoc() . "reference", $ref_res));
                // COORDINATES and JOURNAL get the same presence check as the
                // other reference fields.
                if (isset($aRef['COORDINATES'])) {
                    parent::AddRDF(parent::triplifyString($ref_res, $this->getVoc() . "coordinates", $aRef['COORDINATES']));
                }
                if (isset($aRef['JOURNAL'])) {
                    parent::AddRDF(parent::triplifyString($ref_res, $this->getVoc() . "citation", $aRef['JOURNAL']));
                }
            }

            // Start accumulating the next record.
            $gb_record_str = "";
            $this->WriteRDFBufferToWriteFile();
            continue;
        }

        // Lines consisting of a newline only are not added to the record text.
        preg_match("/^\n\$/", $aLine, $matches);
        if (count($matches) == 0) {
            $gb_record_str .= $aLine;
        }
    } //while
}