Beispiel #1
0
 /**
  * Converts the tab-delimited iProClass mapping file into RDF cross-references.
  *
  * Each chunk returned by the read file's Read(200000) is treated as one
  * tab-delimited record. Before the 1st, 1,000,001st, ... record a new output
  * file named "iproclass.<n>.<output_format>" is opened in the 'outdir'
  * directory (closing and clearing the previous one, if any). For every record
  * one statement is emitted per identifier found in each cross-reference
  * column, and the RDF buffer is written out after every record.
  *
  * Column 14 (PIR-PSD) is skipped because that database is no longer maintained.
  *
  * @return void
  */
 private function process()
 {
     // Cross-reference columns, in emission order.
     //   col   : zero-based column index in the record
     //   rel   : vocabulary suffix of the predicate used with triplify()
     //   ns    : prefix prepended to each identifier for triplify()
     //   url   : base URL for an additional rdfs:seeAlso link
     //   strip : number of leading characters removed from each identifier
     //   last  : marks the final column, which is trimmed and split on ";"
     $columns = array(
         array('col' => 1, 'rel' => 'x-uniprot', 'ns' => 'uniprot:'),
         array('col' => 2, 'rel' => 'x-ncbigene', 'ns' => 'geneid:'),
         array('col' => 3, 'rel' => 'x-refseq', 'ns' => 'refseq:'),
         array('col' => 4, 'rel' => 'x-gi', 'ns' => 'gi:'),
         array('col' => 5, 'rel' => 'x-pdb', 'ns' => 'pdb:'),
         array('col' => 6, 'rel' => 'x-pfam', 'ns' => 'pfam:'),
         // the first three characters of each GO identifier are dropped before the "go:" prefix is added
         array('col' => 7, 'rel' => 'x-go', 'ns' => 'go:', 'strip' => 3),
         array('col' => 8, 'rel' => 'x-pirsf', 'ns' => 'pirsf:'),
         array('col' => 9, 'rel' => 'x-ipi', 'ns' => 'ipi:'),
         array('col' => 10, 'url' => 'http://uniprot.org/uniref/'),
         array('col' => 11, 'url' => 'http://uniprot.org/uniref/'),
         array('col' => 12, 'url' => 'http://uniprot.org/uniref/'),
         array('col' => 13, 'rel' => 'x-uniparc', 'ns' => 'uniparc:', 'url' => 'http://uniprot.org/uniparc/'),
         // column 14 (pir-psd) is skipped because the db is no longer maintained
         array('col' => 15, 'rel' => 'x-taxon', 'ns' => 'taxon:'),
         array('col' => 16, 'rel' => 'x-omim', 'ns' => 'omim:'),
         array('col' => 17, 'rel' => 'x-unigene', 'ns' => 'unigene:'),
         array('col' => 18, 'rel' => 'x-ensembl', 'ns' => 'ensembl:'),
         array('col' => 19, 'rel' => 'x-pubmed', 'ns' => 'pubmed:'),
         array('col' => 20, 'rel' => 'x-genbank', 'ns' => 'genbank:'),
         array('col' => 21, 'rel' => 'x-genbank', 'ns' => 'genbank:', 'last' => true),
     );

     $recordCount = 0;
     $fileNumber = 1;
     while ($l = $this->getReadFile()->Read(200000)) {
         if ($recordCount++ % 1000000 == 0) {
             // progress indicator; also the point at which output rolls over to a new file
             echo $recordCount . PHP_EOL;
             $format = parent::getParameterValue('output_format');
             $odir = parent::getParameterValue('outdir');
             $ofile = 'iproclass.' . $fileNumber++ . "." . $format;
             $gz = strpos($format, "gz") !== false;
             if (parent::getWriteFile() != null) {
                 parent::getWriteFile()->close();
                 parent::clear();
             }
             // generate a new file
             parent::setWriteFile($odir . $ofile, $gz);
         }

         $fields = explode("\t", $l);
         $uniprot_acc = $fields[0];
         $id_res = $this->getNamespace() . $uniprot_acc;

         // the entry always links to its own UniProt accession
         parent::addRDF(parent::triplify($id_res, $this->getVoc() . "x-uniprot", "uniprot:" . $uniprot_acc));

         foreach ($columns as $spec) {
             // short records simply have no value for the missing columns
             $value = isset($fields[$spec['col']]) ? $fields[$spec['col']] : '';
             $isLast = isset($spec['last']);
             if ($isLast) {
                 $value = trim($value);
             }
             if (empty($value)) {
                 continue;
             }

             if ($isLast) {
                 // split on ";" and trim each piece so identifiers do not keep the
                 // space that follows the separator
                 $xrefs = array_map('trim', explode(";", $value));
             } else {
                 $xrefs = explode("; ", $value);
             }

             foreach ($xrefs as $xref) {
                 if (isset($spec['strip'])) {
                     $xref = substr($xref, $spec['strip']);
                 }
                 $rdf = '';
                 if (isset($spec['rel'])) {
                     $rdf .= parent::triplify($id_res, $this->getVoc() . $spec['rel'], $spec['ns'] . $xref);
                 }
                 if (isset($spec['url'])) {
                     $rdf .= parent::QQuadO_URL($id_res, "rdfs:seeAlso", $spec['url'] . $xref);
                 }
                 parent::addRDF($rdf);
             }
         }

         // write rdf to file
         $this->WriteRDFBufferToWriteFile();
     }
 }
Beispiel #2
0
 /**
  * Converts the PharmGKB drugs file into RDF.
  *
  * The first chunk read is treated as the header and must contain exactly 10
  * tab-separated columns. Every following chunk is treated as one
  * tab-delimited record: column 0 is the drug identifier, column 1 its name
  * (also cached in $this->drugs), and columns 2-6 and 9 hold generic names,
  * trade names, brand mixtures, the drug type, cross references and external
  * vocabulary (ATC) terms. The RDF buffer is written out after every record.
  *
  * @return false|null FALSE when the header does not have 10 columns; nothing otherwise.
  */
 function drugs()
 {
     // Set of ATC terms (keyed by "atc:<code>") whose rdfs:label was already emitted.
     // Must be an array: it is written to with string keys below.
     $declared = array();

     $h = explode("\t", $this->GetReadFile()->Read(1000));
     // first line is header
     if (count($h) != 10) {
         trigger_error("Change in number of columns for drugs file", E_USER_ERROR);
         return false;
     }

     while ($l = $this->GetReadFile()->Read(200000)) {
         $a = explode("\t", $l);
         $id = parent::getNamespace() . $a[0];
         $this->drugs[$a[0]] = $a[1];
         parent::addRDF(parent::describeIndividual($id, $a[1], parent::getVoc() . "Drug") . parent::describeClass(parent::getVoc() . "Drug", "PharmGKB Drug"));

         if (trim($a[2])) {
             // generic names
             // Entacapona [INN-Spanish],Entacapone [Usan:Inn],Entacaponum [INN-Latin],entacapone
             $b = explode(',', trim($a[2]));
             foreach ($b as $c) {
                 parent::addRDF(parent::triplifyString($id, parent::getVoc() . "generic_name", str_replace('"', '', $c)));
             }
             parent::addRDF(parent::describeProperty(parent::getVoc() . "generic_name", "Relationship between a PharmGKB drug and a generic name"));
         }

         if (trim($a[3])) {
             // trade names
             // Disorat,OptiPranolol,Trimepranol
             $b = explode(',', trim($a[3]));
             foreach ($b as $c) {
                 parent::addRDF(parent::triplifyString($id, parent::getVoc() . "trade_name", str_replace(array("'", "\""), array("\\\\'", ""), $c)));
             }
             parent::addRDF(parent::describeProperty(parent::getVoc() . "trade_name", "Relationship between a PharmGKB drug and a trade name"));
         }

         if (trim($a[4])) {
             // Brand Mixtures
             // Benzyl benzoate 99+ %,"Dermadex Crm (Benzoic Acid + Benzyl Benzoate + Lindane + Salicylic Acid + Zinc Oxide + Zinc Undecylenate)",
             $b = explode(',', trim($a[4]));
             foreach ($b as $c) {
                 parent::addRDF(parent::triplifyString($id, parent::getVoc() . "brand_mixture", str_replace(array("'", "\""), array("\\\\'", ""), $c)));
             }
             parent::addRDF(parent::describeProperty(parent::getVoc() . "brand_mixture", "Relationship between a PharmGKB drug and a brand mixture"));
         }

         if (trim($a[5])) {
             // Type
             parent::addRDF(parent::triplifyString($id, parent::getVoc() . "drug_class", str_replace(array("'", "\""), array("\\\\'", ""), $a[5])) . parent::describeProperty(parent::getVoc() . "drug_class", "Relationship between a PharmGKB drug and its drug class"));
         }

         if (trim($a[6])) {
             // Cross References
             // drugBank:DB00789,keggDrug:D01707,pubChemCompound:55466,pubChemSubstance:192903,url:http://en.wikipedia.org/wiki/Gadopentetate_dimeglumine
             $b = explode(',', trim(str_replace('"', '', $a[6])));
             foreach ($b as $c) {
                 $this->getRegistry()->parseQName($c, $ns, $id1);
                 // normalise the source prefix: drop quotes/spaces, lowercase, then map to the short names used here
                 $ns = strtolower(str_replace(array('"', ' '), '', $ns));
                 $ns = str_replace(array('keggcompound', 'keggdrug', 'drugbank', 'uniprotkb', 'clinicaltrials.gov', 'drugsproductdatabase(dpd)', 'nationaldrugcodedirectory', 'therapeutictargetsdatabase', 'fdadruglabelatdailymed'), array('kegg', 'kegg', 'drugbank', 'uniprot', 'clinicaltrials', 'dpd', 'ndc', 'ttd', 'dailymed'), $ns);
                 if ($ns == "url") {
                     // link to the parsed value, not to the drug resource itself;
                     // presumably parseQName leaves the URL after "url:" in $id1 — confirm against the registry
                     parent::addRDF(parent::QQuadO_URL($id, "rdfs:seeAlso", $id1));
                 } else {
                     parent::addRDF(parent::triplify($id, parent::getVoc() . "x-" . $ns, $ns . ":" . $id1));
                 }
             }
         }

         if (trim($a[9])) {
             // External Vocabulary
             // ATC:H01AC(Somatropin and somatropin agonists),ATC:V04CD(Tests for pituitary function)
             // ATC:D07AB(Corticosteroids, moderately potent (group II)) => this is why you don't use brackets and commas as separators.
             // NOTE(review): the limit of 2 yields at most two pieces, so a label containing a comma
             // or a third entry will not parse cleanly — confirm the intended behaviour.
             $b = explode(',', trim($a[9]), 2);
             foreach ($b as $c) {
                 preg_match_all("/ATC:([A-Z0-9]+)\\((.*)\\)\$/", $c, $m);
                 if (isset($m[1][0])) {
                     $atc = "atc:" . $m[1][0];
                     parent::addRDF(parent::triplify($id, parent::getVoc() . "x-atc", $atc));
                     // emit the label only the first time a given ATC code is seen
                     if (!isset($declared[$atc])) {
                         $declared[$atc] = '';
                         parent::addRDF(parent::triplifyString($atc, "rdfs:label", $m[2][0]));
                     }
                 }
             }
         }

         parent::WriteRDFBufferToWriteFile();
     }
 }
Beispiel #3
0
 function process()
 {
     $header = $this->GetReadFile()->Read(200000);
     $header_arr = explode("\t", $header);
     $n = 41;
     $c = count($header_arr);
     if ($c != $n) {
         echo PHP_EOL;
         print_r($header_arr);
         trigger_error("Expected {$n} columns, found {$c} . please update the script", E_USER_ERROR);
         exit;
     }
     while ($l = $this->GetReadFile()->Read(4096)) {
         $fields = explode("\t", $l);
         $id = strtolower($fields[0]);
         $approved_symbol = $fields[1];
         $approved_name = $fields[2];
         $status = $fields[3];
         $locus_type = $fields[4];
         $locus_group = $fields[5];
         $previous_symbols = $fields[6];
         $previous_names = $fields[7];
         $synonyms = $fields[8];
         $name_synonyms = $fields[9];
         $chromosome = $fields[10];
         $date_approved = $fields[11];
         $date_modified = $fields[12];
         $date_symbol_changed = $fields[13];
         $date_name_changed = $fields[14];
         $accession_numbers = $fields[15];
         $enzyme_ids = $fields[16];
         $entrez_gene_id = $fields[17];
         $ensembl_gene_id = $fields[18];
         $mouse_genome_database_id = $fields[19];
         $specialist_database_links = $fields[20];
         $specialist_database_ids = $fields[21];
         $pubmed_ids = $fields[22];
         $refseq_ids = $fields[23];
         $gene_family_tag = $fields[24];
         $gene_family_description = $fields[25];
         $record_type = $fields[26];
         $primary_ids = $fields[27];
         $secondary_ids = $fields[28];
         $ccd_ids = $fields[29];
         $vega_ids = $fields[30];
         $locus_specific_databases = $fields[31];
         $entrez_gene_id_mappeddatasuppliedbyNCBI = $fields[32];
         $omim_id_mappeddatasuppliedbyNCBI = $fields[33];
         $refseq_mappeddatasuppliedbyNCBI = $fields[34];
         $uniprot_id_mappeddatasuppliedbyUniProt = $fields[35];
         $ensembl_id_mappeddatasuppliedbyEnsembl = $fields[36];
         $vega_id_mappeddatasuppliedbyVega = $fields[37];
         $ucsc_id_mappeddatasuppliedbyUCSC = $fields[38];
         $mouse_genome_database_id_mappeddatasuppliedbyMGI = $fields[39];
         $rat_genome_database_id_mappeddatasuppliedbyRGD = $fields[40];
         $id_res = $id;
         $id_label = "Gene Symbol for " . $approved_symbol;
         parent::AddRDF(parent::triplify($id_res, "rdf:type", $this->getVoc() . "Gene-Symbol") . parent::describeIndividual($id_res, $id_label, $this->getVoc() . "Gene-Symbol") . parent::describeClass($this->getVoc() . "Gene-Symbol", "HGNC Official Gene Symbol"));
         if (!empty($approved_symbol)) {
             $s = "hgnc.symbol:" . $approved_symbol;
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "approved-symbol", utf8_encode(htmlspecialchars($approved_symbol))) . parent::describeProperty($this->getVoc() . "approved-symbol", "HGNC approved gene symbol", "The official gene symbol that has been approved by the HGNC and is publicly available. Symbols are approved based on specific HGNC nomenclature guidelines. In the HTML results page this ID links to the HGNC Symbol Report for that gene") . parent::describeIndividual($s, $approved_symbol, parent::getVoc() . "Approved-Gene-Symbol") . parent::describeClass(parent::getVoc() . "Approved-Gene-Symbol", "Approved Gene Symbol") . parent::triplify($id_res, parent::getVoc() . "has-approved-symbol", $s) . parent::triplify($s, parent::getVoc() . "is-approved-symbol-of", $id_res));
         }
         if (!empty($approved_name)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "approved-name", utf8_encode(htmlspecialchars($approved_name))) . parent::describeProperty($this->getVoc() . "approved-name", "HGNC approved name", "The official gene name that has been approved by the HGNC and is publicly available. Names are approved based on specific HGNC nomenclature guidelines."));
         }
         if (!empty($status)) {
             $s = $this->getVoc() . str_replace(" ", "-", $status);
             parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "status", $s) . parent::describeProperty($this->getVoc() . "status", "HGNC status", "Indicates whether the gene is classified as: Approved - these genes have HGNC-approved gene symbols. Entry withdrawn - these previously approved genes are no longer thought to exist. Symbol withdrawn - a previously approved record that has since been merged into a another record.") . parent::describeClass($s, $status, $this->getVoc() . "Status"));
         }
         if (!empty($locus_id)) {
             $locus_res = $this->getRes() . $id . "_LOCUS";
             parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "locus", $locus_res) . parent::triplifyString($locus_res, $this->getVoc() . "locus-type", utf8_encode(htmlspecialchars($locus_type))) . parent::triplifyString($locus_res, $this->getVoc() . "locus-group", utf8_encode(htmlspecialchars($locus_group))) . parent::describeProperty($this->getVoc() . "locus-type", "locus type", "Specifies the type of locus described by the given entry") . parent::describeProperty($this->getVoc() . "locus-group", "locus group", "Groups locus types together into related sets. Below is a list of groups and the locus types within the group"));
         }
         if (!empty($previous_symbols)) {
             $previous_symbols = explode(", ", $previous_symbols);
             foreach ($previous_symbols as $previous_symbol) {
                 $previous_symbol_uri = "hgnc.symbol:" . $previous_symbol;
                 parent::AddRDF(parent::describeIndividual($previous_symbol_uri, $previous_symbol, parent::getVoc() . "Previous-Symbol") . parent::describeClass(parent::getVoc() . "Previous-Symbol", "Previous Symbol") . parent::triplify($id_res, $this->getVoc() . "previous-symbol", $previous_symbol_uri) . parent::describeProperty($this->getVoc() . "previous-symbol", "HGNC previous symbol", "Symbols previously approved by the HGNC for this gene"));
             }
         }
         if (!empty($previous_names)) {
             $previous_names = explode(", ", $previous_names);
             foreach ($previous_names as $previous_name) {
                 $previous_name = str_replace("\"", "", $previous_name);
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "previous-name", utf8_encode(htmlspecialchars($previous_name))) . parent::describeProperty($this->getVoc() . "previous-name", "HGNC previous name", "Gene names previously approved by the HGNC for this gene"));
             }
         }
         if (!empty($synonyms)) {
             $synonyms = explode(", ", $synonyms);
             foreach ($synonyms as $synonym) {
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "synonym", utf8_encode(htmlspecialchars($synonym))) . parent::describeProperty($this->getVoc() . "synonym", "synonym", "Other symbols used to refer to this gene"));
             }
         }
         if (!empty($name_synonyms)) {
             $name_synonyms = explode(", ", $name_synonyms);
             foreach ($name_synonyms as $name_synonym) {
                 $name_synonym = str_replace("\"", "", $name_synonym);
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "name-synonym", utf8_encode(htmlspecialchars($name_synonym))) . parent::describeProperty($this->getVoc() . "name-synonym", "name synonym", "Other names used to refer to this gene"));
             }
         }
         if (!empty($chromosome)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "chromosome", utf8_encode(htmlspecialchars($chromosome))) . parent::describeProperty($this->getVoc() . "chromosome", "chromosome", "Indicates the location of the gene or region on the chromosome"));
         }
         if (!empty($date_approved)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "date-approved", $date_approved, "xsd:date") . parent::describeProperty($this->getVoc() . "date-approved", "date approved", "Date the gene symbol and name were approved by the HGNC"));
         }
         if (!empty($date_modified)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "date-modified", $date_modified, "xsd:date") . parent::describeProperty($this->getVoc() . "date-modified", "date modified", "the date the entry was modified by the HGNC"));
         }
         if (!empty($date_symbol_changed)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "date-symbol-changed", $date_symbol_changed, "xsd:date") . parent::describeProperty($this->getVoc() . "date-symbol-changed", "date symbol changed", "The date the gene symbol was last changed by the HGNC from a previously approved symbol. Many genes receive approved symbols and names which are viewed as temporary (eg C2orf#) or are non-ideal when considered in the light of subsequent information. In the case of individual genes a change to the name (and subsequently the symbol) is only made if the original name is seriously misleading"));
         }
         if (!empty($date_name_changed)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "date-name-changed", $date_name_changed, "xsd:date") . parent::describeProperty($this->getVoc() . "date-name-changed", "date name changed", "The date the gene name was last changed by the HGNC from a previously approved name"));
         }
         if (!empty($accession_numbers)) {
             $accession_numbers = explode(", ", $accession_numbers);
             foreach ($accession_numbers as $accession_number) {
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "accession", utf8_encode(htmlspecialchars($accession_number))) . parent::describeProperty($this->getVoc() . "accession", "accession number", "Accession numbers for each entry selected by the HGNC"));
             }
         }
         if (!empty($enzyme_ids)) {
             $enzyme_ids = explode(", ", $enzyme_ids);
             foreach ($enzyme_ids as $enzyme_id) {
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "x-ec", utf8_encode(htmlspecialchars($enzyme_id))) . parent::describeProperty($this->getVoc() . "x-ec", "Enzyme Commission (EC) number", "Enzyme entries have Enzyme Commission (EC) numbers associated with them that indicate the hierarchical functional classes to which they belong"));
             }
         }
         if (!empty($entrez_gene_id)) {
             parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ncbigene", "ncbigene:{$entrez_gene_id}") . parent::describeProperty($this->getVoc() . "x-ncbigene", "NCBI Gene", "NCBI Gene provides curated sequence and descriptive information about genetic loci including official nomenclature, synonyms, sequence accessions, phenotypes, EC numbers, MIM numbers, UniGene clusters, homology, map locations, and related web sites"));
         }
         if (!empty($ensembl_gene_id)) {
             parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ensembl", "ensembl:{$ensembl_gene_id}") . parent::describeProperty($this->getVoc() . "x-ensembl", "Ensembl Gene"));
         }
         if (!empty($mouse_genome_database_id)) {
             if (strpos($mouse_genome_database_id, "MGI:") !== FALSE) {
                 $mouse_genome_database_id = substr($mouse_genome_database_id, 4);
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-mgi", "mgi:{$mouse_genome_database_id}") . parent::describeProperty($this->getVoc() . "x-mgi", "MGI entry"));
             }
         }
         if (!empty($specialist_database_links)) {
             $specialist_database_links = explode(", ", $specialist_database_links);
             foreach ($specialist_database_links as $specialist_database_link) {
                 preg_match('/href="(\\S+)"/', $specialist_database_link, $matches);
                 if (!empty($matches[1])) {
                     parent::AddRDF(parent::QQuadO_URL($id_res, $this->getVoc() . "xref", $matches[1]) . parent::describeProperty($this->getVoc() . "xref", "Specialist database references."));
                 }
             }
         }
         if (!empty($pubmed_ids)) {
             $pubmed_ids = explode(", ", $pubmed_ids);
             foreach ($pubmed_ids as $pubmed_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-pubmed", "pubmed:" . trim($pubmed_id)) . parent::describeProperty($this->getVoc() . "x-pubmed", "NCBI PubMed entry", "Identifier that links to published articles relevant to the entry in the NCBI's PubMed database."));
             }
         }
         if (!empty($refseq_ids)) {
             $refseq_ids = explode(", ", $refseq_ids);
             foreach ($refseq_ids as $refseq_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-refseq", "refseq:" . trim($refseq_id)) . parent::describeProperty($this->getVoc() . "x-refseq", "NCBI Refseq entry", "The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry."));
             }
         }
         if (!empty($gene_family_tag)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "gene-family-tag", utf8_encode(htmlspecialchars($gene_family_tag))) . parent::describeProperty($this->getVoc() . "gene-family-tag", "Gene Family Tag", "Tag used to designate a gene family or group the gene has been assigned to, according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group. These tags are used to generate gene family or grouping specific pages at genenames.org and do not necessarily reflect an official nomenclature. Each gene family has an associated gene family tag and gene family description. If a particular gene is a member of more than one gene family, the tags and the descriptions will be shown in the same order."));
         }
         if (!empty($gene_family_description)) {
             $gene_family_description = str_replace("\"", "", $gene_family_description);
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "gene-family-description", utf8_encode(htmlspecialchars($gene_family_description))) . parent::describeProperty($this->getVoc() . "gene-family-description", "gene family name", "Name given to a particular gene family. The gene family description has an associated gene family tag. Gene families are used to group genes according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group."));
         }
         if (!empty($record_type)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "record-type", utf8_encode(htmlspecialchars($record_type))));
         }
         if (!empty($primary_ids)) {
             $primary_ids = explode(", ", $primary_ids);
             foreach ($primary_ids as $primary_id) {
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "primary-id", utf8_encode(htmlspecialchars($primary_id))) . parent::describeProperty($this->getVoc() . "primary-id", "primary identifier"));
             }
         }
         if (!empty($secondary_ids)) {
             $secondary_ids = explode(", ", $secondary_ids);
             foreach ($secondary_ids as $secondary_id) {
                 parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "secondary-id", utf8_encode(htmlspecialchars($secondary_id))) . parent::describeProperty($this->getVoc() . "secondary-id", "secondary identifier"));
             }
         }
         if (!empty($ccd_ids)) {
             $ccd_ids = explode(", ", $ccd_ids);
             foreach ($ccd_ids as $ccd_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ccds", "ccds:" . trim($ccd_id)) . parent::describeProperty($this->getVoc() . "x-ccds", "consensus CDS entry", "The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations."));
             }
         }
         if (!empty($vega_ids)) {
             $vega_ids = explode(", ", $vega_ids);
             foreach ($vega_ids as $vega_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-vega", "vega:" . trim($vega_id)) . parent::describeProperty($this->getVoc() . "x-vega", "VEGA gene entry"));
             }
         }
         if (!empty($locus_specific_databases)) {
             parent::AddRDF(parent::triplifyString($id_res, $this->getVoc() . "locus-specific-xref", utf8_encode(htmlspecialchars($locus_specific_databases))) . parent::describeProperty($this->getVoc() . "locus-specific-xref", "locus specific xref", "This contains a list of links to databases or database entries pertinent to the gene"));
         }
         if (!empty($entrez_gene_id_mappeddatasuppliedbyNCBI)) {
             $entrez_gene_id_mappeddatasuppliedbyNCBI = explode(", ", $entrez_gene_id_mappeddatasuppliedbyNCBI);
             foreach ($entrez_gene_id_mappeddatasuppliedbyNCBI as $gene_id) {
                 if (strstr($gene_id, ":") !== FALSE) {
                     $a = explode(":", $gene_id);
                     $gene_id = $a[1];
                 }
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ncbigene", "ncbigene:" . trim($gene_id)) . parent::describeProperty($this->getVoc() . "x-ncbigene", "NCBI Gene entry"));
             }
         }
         if (!empty($omim_id_mappeddatasuppliedbyNCBI)) {
             $omim_id_mappeddatasuppliedbyNCBI = explode(", ", $omim_id_mappeddatasuppliedbyNCBI);
             foreach ($omim_id_mappeddatasuppliedbyNCBI as $omim_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-omim", "omim:" . trim($omim_id)) . parent::describeProperty($this->getVoc() . "x-omim", "OMIM entry", "Identifier provided by Online Mendelian Inheritance in Man (OMIM) at the NCBI. This database is described as a catalog of human genes and genetic disorders containing textual information and links to MEDLINE and sequence records in the Entrez system, and links to additional related resources at NCBI and elsewhere. In the HTML results page this ID links to the OMIM page for that entry."));
             }
         }
         if (!empty($refseq_mappeddatasuppliedbyNCBI)) {
             $refseq_mappeddatasuppliedbyNCBI = explode(", ", $refseq_mappeddatasuppliedbyNCBI);
             foreach ($refseq_mappeddatasuppliedbyNCBI as $refseq_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-refseq", "refseq:" . trim($refseq_id)) . parent::describeProperty($this->getVoc() . "x-refseq", "NCBI Refseq entry", "The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry."));
             }
         }
         if (!empty($uniprot_id_mappeddatasuppliedbyUniProt)) {
             $uniprot_id_mappeddatasuppliedbyUniProt = explode(", ", $uniprot_id_mappeddatasuppliedbyUniProt);
             foreach ($uniprot_id_mappeddatasuppliedbyUniProt as $uniprot_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-uniprot", "uniprot:" . trim($uniprot_id)) . parent::describeProperty($this->getVoc() . "x-uniprot", "Uniprot entry", "The UniProt identifier, provided by the EBI. The UniProt Protein Knowledgebase is described as a curated protein sequence database that provides a high level of annotation, a minimal level of redundancy and high level of integration with other databases. In the HTML results page this ID links to the UniProt page for that entry."));
             }
         }
         if (!empty($ensembl_id_mappeddatasuppliedbyEnsembl)) {
             $ensembl_id_mappeddatasuppliedbyEnsembl = explode(", ", $ensembl_id_mappeddatasuppliedbyEnsembl);
             foreach ($ensembl_id_mappeddatasuppliedbyEnsembl as $ensembl_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ensembl", "ensembl:" . trim($refseq_id)) . parent::describeProperty($this->getVoc() . "x-ensembl", "Ensembl entry", "The Ensembl ID is derived from the current build of the Ensembl database and provided by the Ensembl team."));
             }
         }
         if (!empty($ucsc_id_mappeddatasuppliedbyVega)) {
             $ucsc_id_mappeddatasuppliedbyVega = explode(", ", $ucsc_id_mappeddatasuppliedbyVega);
             foreach ($ucsc_id_mappeddatasuppliedbyVega as $vega_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-vega", "vega:" . trim($vega_id)) . parent::describeProperty($this->getVoc() . "x-vega", "Vega entry"));
             }
         }
         if (!empty($ucsc_id_mappeddatasuppliedbyUCSC)) {
             $ucsc_id_mappeddatasuppliedbyUCSC = explode(", ", $ucsc_id_mappeddatasuppliedbyUCSC);
             foreach ($ucsc_id_mappeddatasuppliedbyUCSC as $ucsc_id) {
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-ucsc", "ucsc:" . trim($ucsc_id)) . parent::describeProperty($this->getVoc() . "x-ucsc", "UCSC entry"));
             }
         }
         if (!empty($mouse_genome_database_id_mappeddatasuppliedbyMGI)) {
             $mouse_genome_database_id_mappeddatasuppliedbyMGI = explode(", ", $mouse_genome_database_id_mappeddatasuppliedbyMGI);
             foreach ($mouse_genome_database_id_mappeddatasuppliedbyMGI as $mgi_id) {
                 if (strpos($mgi_id, "MGI:") !== FALSE) {
                     $mgi_id = substr($mgi_id, 4);
                 }
                 parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-mgi", "mgi:" . trim($mgi_id)) . parent::describeProperty($this->getVoc() . "x-mgi", "MGI entry"));
             }
         }
         if (!empty($rat_genome_database_id_mappeddatasuppliedbyRGD)) {
             $rat_genome_database_id_mappeddatasuppliedbyRGD = explode(", ", trim($rat_genome_database_id_mappeddatasuppliedbyRGD));
             foreach ($rat_genome_database_id_mappeddatasuppliedbyRGD as $rgd_id) {
                 $rgd_id = trim($rgd_id);
                 if (!empty($rgd_id)) {
                     parent::AddRDF(parent::triplify($id_res, $this->getVoc() . "x-rgd", trim($rgd_id)) . parent::describeProperty($this->getVoc() . "x-rgd", "RGD entry"));
                 }
             }
         }
         //write RDF to file
         $this->WriteRDFBufferToWriteFile();
     }
     //while
 }
Beispiel #4
0
 /**
  * Convert the GenBank flat file exposed by getReadFile() into RDF.
  *
  * Every value returned by Read(4096) is treated as one line. Lines are
  * buffered until a record terminator (a line consisting of "//") is seen;
  * the buffered record is then split into sections, each section is parsed
  * by its dedicated parse*() helper, the resulting statements are queued
  * with parent::AddRDF() and finally flushed with
  * WriteRDFBufferToWriteFile(). Lines that consist of a single newline are
  * not added to the record buffer.
  *
  * Sections handled: LOCUS, DEFINITION, ACCESSION, VERSION, KEYWORDS,
  * SEGMENT (optional), FEATURES, SOURCE, CONTIG (optional), REFERENCE.
  *
  * Text that is still buffered when the input ends without a closing "//"
  * is not emitted.
  */
 function process()
 {
     $gb_record_str = "";
     // pattern for feature qualifiers of the form key=value; loop-invariant
     $qualifier_pattern = "/(\\S+)\\=(.*)/";
     while ($aLine = $this->getReadFile()->Read(4096)) {
         if (!preg_match("/^\\/\\/\$/", $aLine)) {
             // not a record terminator: buffer the line unless it is blank
             if (!preg_match("/^\n\$/", $aLine)) {
                 $gb_record_str .= $aLine;
             }
             continue;
         }

         // SEGMENT and CONTIG are optional. Reset their parse results for
         // every record, otherwise a record without these sections would
         // re-emit the values parsed for an earlier record.
         $parsed_segments_arr = null;
         $parsed_contig_arr = null;

         //now remove the header if it is there
         $gb_record_str = $this->removeHeader($gb_record_str);
         $sectionsRaw = $this->parseGenbankRaw($gb_record_str);

         //get locus section(s)
         $locus = $this->retrieveSections("LOCUS", $sectionsRaw);
         $parsed_locus_arr = $this->parseLocus($locus);
         //get the definition section
         $definition = $this->retrieveSections("DEFINITION", $sectionsRaw);
         $parsed_definition_arr = $this->parseDefinition($definition);
         //get the accession
         $accessions = $this->retrieveSections("ACCESSION", $sectionsRaw);
         $parsed_accession_arr = $this->parseAccession($accessions);
         //get the version
         $versions = $this->retrieveSections("VERSION", $sectionsRaw);
         $parsed_version_arr = $this->parseVersion($versions);
         //get the keywords
         $keywords = $this->retrieveSections("KEYWORDS", $sectionsRaw);
         $parsed_keyword_arr = $this->parseKeywords($keywords);
         //may not be any segment section
         $segments = $this->retrieveSections("SEGMENT", $sectionsRaw);
         if (!empty($segments)) {
             $parsed_segments_arr = $this->parseSegment($segments);
         }
         //get the features
         $features = $this->retrieveSections("FEATURES", $sectionsRaw);
         $parsed_features_arr = $this->parseFeatures($features);
         //get the source section
         $source = $this->retrieveSections("SOURCE", $sectionsRaw);
         $parsed_source_arr = $this->parseSource($source);
         //may not be any contig section
         $contig = $this->retrieveSections("CONTIG", $sectionsRaw);
         if (!empty($contig)) {
             $parsed_contig_arr = $this->parseContig($contig);
         }
         //get the reference section
         $references = $this->retrieveSections("REFERENCE", $sectionsRaw);
         $parsed_refs_arr = $this->parseReferences($references);

         // the record itself, identified by its GI number
         $gb_res = "gi:" . $parsed_version_arr['gi'];
         $gb_label = utf8_encode(htmlspecialchars($parsed_definition_arr[0]));
         $locus_entry = $parsed_locus_arr[0];
         parent::AddRDF(
             parent::describeIndividual($gb_res, $gb_label, $this->getVoc() . "genbank-record")
             . parent::triplifyString($gb_res, $this->getVoc() . 'sequence-length', $locus_entry['sequence_length'])
             . parent::triplifyString($gb_res, $this->getVoc() . 'strandedness', $locus_entry['strandedness'])
             . parent::triplify($gb_res, "rdf:type", $this->getRes() . $locus_entry['mol_type'])
             . parent::triplifyString($gb_res, $this->getVoc() . 'chromosome-shape', $locus_entry['chromosome_shape'])
             . parent::triplifyString($gb_res, $this->getVoc() . 'division-name', $locus_entry['division_name'])
             . parent::triplifyString($gb_res, $this->getVoc() . 'date-of-entry', $locus_entry['date'])
             . parent::triplifyString($gb_res, $this->getVoc() . 'source', utf8_encode($parsed_source_arr[0]))
             . parent::QQuadO_URL($gb_res, $this->getVoc() . 'fasta-seq', 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?sendto=on&db=nucest&dopt=fasta&val=' . $parsed_version_arr['gi'])
         );

         // features: one class per feature type, one individual per
         // (type, location, record) combination
         foreach ($parsed_features_arr as $aFeature) {
             $type = $aFeature['type'];
             $feat_desc = $this->getFeatures($type);
             // collapse whitespace runs in the feature definition
             $label = preg_replace('/\\s\\s*/', ' ', $feat_desc['definition']);
             // the value is split on "/"; the first piece is used as the
             // location, every piece is scanned below for key=value pairs
             $value_arr = explode("/", $aFeature['value']);
             $location = preg_replace('/\\n/', '', $value_arr[0]);
             $class_id = parent::getVoc() . md5($type);
             $feat_res = parent::getRes() . md5($type . $location . $gb_res);
             $feat_label = utf8_encode($type . " " . $location . " for " . $gb_res);
             if (isset($feat_desc['comment'])) {
                 $label .= " " . preg_replace('/\\s\\s*/', ' ', $feat_desc['comment']);
             }
             parent::AddRDF(
                 parent::describeClass($class_id, $label, parent::getVoc() . "Feature")
                 . parent::describeIndividual($feat_res, $feat_label, $class_id)
                 . parent::triplify($gb_res, $this->getVoc() . "has-feature", $feat_res)
             );
             foreach ($value_arr as $aL) {
                 //check if aL has an equals in it
                 if (!preg_match($qualifier_pattern, $aL, $m)) {
                     continue;
                 }
                 $qualifier_value = str_replace("\"", "", $m[2]);
                 if ($m[1] == "db_xref") {
                     // cross references become rdfs:seeAlso links
                     parent::AddRDF(parent::triplify($feat_res, "rdfs:seeAlso", $qualifier_value));
                 } else {
                     // any other qualifier becomes a literal-valued property
                     parent::AddRDF(parent::triplifyString($feat_res, $this->getVoc() . $m[1], utf8_encode($qualifier_value)));
                 }
             }
         }

         foreach ($parsed_accession_arr[0] as $acc) {
             parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "accession", $acc));
         }
         if (isset($parsed_version_arr['versioned_accession'])) {
             parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "versioned-accession", $parsed_version_arr['versioned_accession']));
         }
         if (isset($parsed_contig_arr)) {
             foreach ($parsed_contig_arr as $aContig) {
                 parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "contig", parent::safeLiteral($aContig)));
             }
         }
         foreach ($parsed_keyword_arr as $akw) {
             parent::AddRDF(parent::triplifyString($gb_res, $this->getVoc() . "keyword", $akw));
         }
         if (isset($parsed_segments_arr)) {
             foreach ($parsed_segments_arr as $aSeg) {
                 parent::AddRDF(
                     parent::triplifyString($gb_res, $this->getVoc() . "segment-number", $aSeg['segment_number'])
                     . parent::triplifyString($gb_res, $this->getVoc() . "total-segments", $aSeg['total_segments'])
                 );
             }
         }

         foreach ($parsed_refs_arr as $ref_key => $aRef) {
             // Derive the reference identifier from the record id and the
             // reference's key within the record. This is reproducible
             // between runs and, unlike md5(rand()), cannot make two
             // references of the same record share one resource.
             $ref_res = $this->getRes() . md5($gb_res . "|reference|" . $ref_key);
             $ref_label = "reference for " . $gb_res;
             if (isset($aRef['TITLE'])) {
                 parent::AddRDF(
                     parent::describeIndividual($ref_res, $ref_label, $this->getVoc() . "reference")
                     . parent::triplifyString($ref_res, $this->getVoc() . "title", $aRef['TITLE'])
                 );
             }
             if (isset($aRef['PUBMED'])) {
                 parent::AddRDF(parent::triplify($ref_res, $this->getVoc() . "x-pubmed", 'pubmed:' . $aRef['PUBMED']));
             }
             if (isset($aRef['AUTHORS'])) {
                 parent::AddRDF(parent::triplifyString($ref_res, $this->getVoc() . "authors", $aRef['AUTHORS']));
             }
             // NOTE(review): COORDINATES and JOURNAL are read without an
             // isset() guard, unlike TITLE/PUBMED/AUTHORS - confirm that
             // parseReferences() always provides both keys.
             parent::AddRDF(
                 parent::triplify($gb_res, $this->getVoc() . "reference", $ref_res)
                 . parent::triplifyString($ref_res, $this->getVoc() . "coordinates", $aRef['COORDINATES'])
                 . parent::triplifyString($ref_res, $this->getVoc() . "citation", $aRef['JOURNAL'])
             );
         }

         // start a fresh buffer for the next record and flush what we have
         $gb_record_str = "";
         $this->WriteRDFBufferToWriteFile();
     }
     //while
 }