/**
 * Convert one gzip-compressed IPI cross-reference file into an N-Triples file.
 *
 * Every input line is split on tabs. Column 2 supplies the IPI identifier that
 * becomes the subject (http://bio2rdf.org/ipi:<id>); the cross-reference
 * columns are parsed with readIdentifiers() and written as one triple per
 * identifier. The output file is <outputPath><input name without its last
 * extension>.nt.
 *
 * The script terminates through die() when either file cannot be opened.
 *
 * @param string $anInputPath  Directory holding the input file. A trailing "/"
 *                             is appended when missing; an empty string is left
 *                             untouched so the file is resolved relative to the
 *                             current directory instead of the filesystem root.
 * @param string $anOSCODEFile Name of the gzipped input file inside $anInputPath.
 * @param string $outputPath   Directory for the .nt output, normalised the same
 *                             way as $anInputPath.
 *
 * @return void
 */
function parser_ipi_OSCODE_xref_gz_file($anInputPath, $anOSCODEFile, $outputPath)
{
    $anInputPath = (string) $anInputPath;
    $outputPath = (string) $outputPath;

    // Test the last character directly. The previous check compared strrpos()
    // with count() of a string, which never described "ends with a slash".
    if ($anInputPath !== '' && substr($anInputPath, -1) !== '/') {
        $anInputPath .= '/';
    }
    if ($outputPath !== '' && substr($outputPath, -1) !== '/') {
        $outputPath .= '/';
    }

    echo "Processing {$anInputPath}, {$anOSCODEFile}, {$outputPath} " . PHP_EOL;

    // Drop the last extension (e.g. ".gz"); keep the whole name when there is
    // no dot, otherwise the output file would be called just ".nt".
    $dotPos = strrpos($anOSCODEFile, '.');
    $out_fileName = ($dotPos === false) ? $anOSCODEFile : substr($anOSCODEFile, 0, $dotPos);

    $ifh = gzopen($anInputPath . $anOSCODEFile, 'r') or die("Could not open " . $anInputPath . $anOSCODEFile . "\n");
    $outfh = fopen($outputPath . $out_fileName . ".nt", 'w') or die("Could not open file here!\n");

    // Emission plan, kept in the order the triples were originally written:
    // array(column index, predicate local name, bio2rdf namespace).
    // See http://www.ebi.ac.uk/IPI/xrefs.html for the column layout.
    $xrefColumns = array(
        array(6, 'has_refseq_id', 'refseq'),    // supplementary RefSeq STATUS:ID couples
        array(1, 'has_uniprot_id', 'uniprot'),  // master entry accession (UniProtKB masters only)
        array(3, 'has_uniprot_id', 'uniprot'),  // supplementary UniProtKB/Swiss-Prot entries
        array(4, 'has_uniprot_id', 'uniprot'),  // supplementary UniProtKB/TrEMBL entries
        array(5, 'has_ensembl_id', 'ensembl'),  // supplementary Ensembl entries
        array(7, 'has_tair_id', 'tair'),        // supplementary TAIR protein entries
        array(8, 'has_hinv_id', 'hinv'),        // supplementary H-Inv protein entries
        array(9, 'has_embl_id', 'embl'),        // EMBL/Genbank/DDBJ protein identifiers
        array(10, 'has_hgnc_id', 'hgnc'),       // HGNC number / gene symbol couples
        array(11, 'has_gene_id', 'gene'),       // NCBI Entrez Gene number / symbol couples
        array(12, 'has_uniparc_id', 'uniparc'), // UniParc identifier
        array(13, 'has_unigene_id', 'unigene'), // UniGene identifiers
        array(14, 'has_ccds_id', 'ccds'),       // CCDS identifiers
        // NOTE(review): RefSeq GI protein identifiers are published in the
        // "gene" namespace, exactly as before - confirm this is intended.
        array(15, 'has_gene_id', 'gene'),
        array(16, 'has_vega_id', 'vega'),       // supplementary Vega entries
    );

    while (!gzeof($ifh)) {
        // No length limit: a fixed 4096-byte read split long records into
        // fragments that were then parsed as if they were separate records.
        $aLine = gzgets($ifh);
        if ($aLine === false) {
            break;
        }

        $aLine = rtrim($aLine, "\r\n");
        if ($aLine === '') {
            continue;
        }

        $tLine = explode("\t", $aLine);

        // Without an IPI identifier there is no subject to attach triples to.
        if (!isset($tLine[2]) || $tLine[2] === '') {
            continue;
        }
        $entryURI = "http://bio2rdf.org/ipi:" . $tLine[2];

        // Column 0 names the database the master entry was taken from. Column 1
        // is a UniProtKB accession only for Swiss-Prot (SP) and TrEMBL (TR)
        // masters; for the other masters it holds a Vega, Ensembl, RefSeq, TAIR
        // or H-InvDB identifier and must not be published as a UniProt xref.
        $masterIsUniprot = ($tLine[0] === 'SP' || $tLine[0] === 'TR');

        $buf = "";
        foreach ($xrefColumns as $spec) {
            list($column, $predicate, $namespace) = $spec;

            if (!isset($tLine[$column])) {
                continue;
            }

            if ($column === 1) {
                if (!$masterIsUniprot) {
                    continue;
                }
                $ids = array(getFirstId($tLine[1]));
            } else {
                $ids = readIdentifiers($tLine[$column]);
            }

            if (!is_array($ids)) {
                continue;
            }

            foreach ($ids as $id) {
                if (!is_scalar($id)) {
                    continue;
                }
                $id = (string) $id;
                if ($id === '' || $id === "\n") {
                    continue;
                }
                $buf .= "<{$entryURI}> <http://bio2rdf.org/ipi_vocabulary:{$predicate}> <http://bio2rdf.org/{$namespace}:" . $id . "> .\n";
            }
        }

        if ($buf !== '') {
            fwrite($outfh, $buf);
        }
    }

    // Reached only when gzgets() failed before the end of the stream.
    if (!gzeof($ifh)) {
        echo "Error: unexpected gzgets() fail!\n";
    }

    gzclose($ifh);
    fclose($outfh);
}
/**
 * Convert the tab-separated gene cross-reference records of the current read
 * file into RDF.
 *
 * Records are pulled with $this->getReadFile()->Read(4096) until it returns a
 * falsy value. Lines whose first column starts with "#" are skipped. Column 8
 * must yield exactly two identifiers (gene id and gene symbol); records without
 * a gene id are skipped, because every statement uses "gene:<id>" as its
 * subject. The RDF buffer is flushed after each emitted record.
 *
 * Columns 1, 17, 19 and 20 are not emitted, matching the output of the
 * previous implementation.
 *
 * @return void
 */
private function gene_xrefs()
{
    // Emission plan in the original output order:
    // array(column index, vocabulary term, target namespace).
    // A null namespace means the value is written as a string literal.
    $xrefColumns = array(
        array(6, 'x-ensembl', 'ensembl'),
        array(5, 'gene-location', null),
        array(9, 'x-ipi', 'ipi'),
        array(10, 'x-uniprot', 'uniprot'),
        // Column 11 was read into $uniprot_tre but emitted from the undefined
        // $uniprotkb_tre, so these cross-references were never written.
        array(11, 'x-uniprot', 'uniprot'),
        array(12, 'x-ensembl', 'ensembl'),
        // NOTE(review): column 20 used to overwrite this RefSeq list. It is no
        // longer read here; the unused $vega_peptides local of the previous
        // code suggests it holds Vega peptides - confirm against the file format.
        array(13, 'x-refseq', 'refseq'),
        array(14, 'x-tair', 'tair'),
        array(15, 'x-hinv', 'hinv'),
        array(16, 'x-unigene', 'unigene'),
        // Called $refseq_gis in the previous code; still published as refseq:.
        array(18, 'x-refseq', 'refseq'),
    );

    while ($aLine = $this->getReadFile()->Read(4096)) {
        $tLine = explode("\t", rtrim($aLine, "\r\n"));

        if ($this->startsWith($tLine[0], "#")) {
            continue;
        }

        // Gene id and symbol come as a two-element couple in column 8.
        $gene_id = null;
        $gene_symbol = null;
        if (isset($tLine[8])) {
            $couple = readIdentifiers($tLine[8]);
            if (is_array($couple) && count($couple) == 2) {
                $gene_id = $couple[0];
                $gene_symbol = $couple[1];
            }
        }

        // Previously a record without a gene id reused the subject left over
        // from the preceding record, attaching its xrefs to the wrong gene.
        if ($gene_id === null || $gene_id === '') {
            continue;
        }
        $res = "gene:" . $gene_id;

        $chromosome = null;
        $chr = readIdentifiers($tLine[0]);
        if (is_array($chr) && isset($chr[0]) && $chr[0] != "Un") {
            $chromosome = $chr[0];
        }

        // Coordinates are only accepted when the column holds a single value.
        $start_coord = null;
        if (isset($tLine[2])) {
            $start = readIdentifiers($tLine[2]);
            if (is_array($start) && count($start) == 1) {
                $start_coord = $start[0];
            }
        }

        $end_coord = null;
        if (isset($tLine[3])) {
            $end = readIdentifiers($tLine[3]);
            if (is_array($end) && count($end) == 1) {
                $end_coord = $end[0];
            }
        }

        // Only the first strand value is published.
        $strand = null;
        if (isset($tLine[4])) {
            $strands = readIdentifiers($tLine[4]);
            if (is_array($strands) && isset($strands[0])) {
                $strand = $strands[0];
            }
        }

        // Single-valued attributes; values that could not be parsed are
        // skipped instead of being handed to triplifyString() as null.
        $attributes = array(
            'gene-symbol' => $gene_symbol,
            'chromosome' => $chromosome,
            'start-coordinate' => $start_coord,
            'end-coordinate' => $end_coord,
            'strand' => $strand,
        );
        foreach ($attributes as $term => $value) {
            if ($value === null || $value === '') {
                continue;
            }
            parent::AddRDF(parent::triplifyString($res, $this->getVoc() . $term, $value));
        }

        foreach ($xrefColumns as $spec) {
            list($column, $term, $namespace) = $spec;

            if (!isset($tLine[$column])) {
                continue;
            }

            $ids = readIdentifiers($tLine[$column]);
            if (!is_array($ids)) {
                continue;
            }

            foreach ($ids as $x) {
                if (!is_scalar($x)) {
                    continue;
                }
                $x = (string) $x;
                if ($x === '' || $x === "\n") {
                    continue;
                }

                if ($namespace === null) {
                    parent::AddRDF(parent::triplifyString($res, $this->getVoc() . $term, $x));
                } else {
                    parent::AddRDF(parent::triplify($res, $this->getVoc() . $term, $namespace . ":" . $x));
                }
            }
        }

        $this->WriteRDFBufferToWriteFile();
    }
}