Example #1
0
/**
 * Convert one gzipped, tab-separated IPI cross-reference file into N-Triples.
 *
 * Reads $anInputPath/$anOSCODEFile row by row and, for every row that carries
 * an IPI identifier (column 2), writes one triple per cross-referenced
 * identifier to $outputPath/<input name without its last extension>.nt.
 * The column layout is described at http://www.ebi.ac.uk/IPI/xrefs.html.
 *
 * Columns are split into identifiers by readIdentifiers() / getFirstId(),
 * which are defined outside this function.
 *
 * The script is terminated with die() when either file cannot be opened.
 *
 * @param string $anInputPath  Directory holding the input file; a trailing "/" is optional.
 * @param string $anOSCODEFile Name of the gzipped xref file inside $anInputPath.
 * @param string $outputPath   Directory receiving the .nt file; a trailing "/" is optional.
 * @return void
 */
function parser_ipi_OSCODE_xref_gz_file($anInputPath, $anOSCODEFile, $outputPath)
{
    // Append a directory separator only when the path does not already end in one.
    if (substr($anInputPath, -1) !== "/") {
        $anInputPath .= "/";
    }
    if (substr($outputPath, -1) !== "/") {
        $outputPath .= "/";
    }
    echo "Processing {$anInputPath}, {$anOSCODEFile}, {$outputPath} " . PHP_EOL;

    // Strip the last extension (e.g. ".gz"); keep the whole name when there is none.
    $dotPos = strrpos($anOSCODEFile, ".");
    $out_fileName = ($dotPos === false) ? $anOSCODEFile : substr($anOSCODEFile, 0, $dotPos);

    $ifh = gzopen($anInputPath . $anOSCODEFile, 'r');
    if ($ifh === false) {
        die("Could not open " . $anInputPath . $anOSCODEFile . "\n");
    }
    $outfh = fopen($outputPath . $out_fileName . ".nt", 'w');
    if ($outfh === false) {
        gzclose($ifh);
        die("Could not open file here!\n");
    }

    // Each entry: column index, ipi_vocabulary predicate, bio2rdf namespace.
    // The order matches the order in which the triples of a row are written.
    // NOTE(review): column 15 (RefSeq GI protein identifiers) is published with
    // the same predicate/namespace as column 11 (Entrez Gene) - confirm this
    // is intended.
    $xrefColumns = array(
        array(6, "has_refseq_id", "refseq"),
        array(1, "has_uniprot_id", "uniprot"),
        array(3, "has_uniprot_id", "uniprot"),
        array(4, "has_uniprot_id", "uniprot"),
        array(5, "has_ensembl_id", "ensembl"),
        array(7, "has_tair_id", "tair"),
        array(8, "has_hinv_id", "hinv"),
        array(9, "has_embl_id", "embl"),
        array(10, "has_hgnc_id", "hgnc"),
        array(11, "has_gene_id", "gene"),
        array(12, "has_uniparc_id", "uniparc"),
        array(13, "has_unigene_id", "unigene"),
        array(14, "has_ccds_id", "ccds"),
        array(15, "has_gene_id", "gene"),
        array(16, "has_vega_id", "vega"),
    );

    while (!gzeof($ifh)) {
        $aLine = gzgets($ifh, 4096);
        if ($aLine === false) {
            // Read failure (or nothing left); the check after the loop reports it.
            break;
        }
        // gzgets() returns at most 4095 bytes per call, so keep reading until
        // the row is complete instead of treating the remainder as a new row.
        while (substr($aLine, -1) !== "\n" && !gzeof($ifh)) {
            $more = gzgets($ifh, 4096);
            if ($more === false) {
                break;
            }
            $aLine .= $more;
        }

        // Drop the line terminator so it cannot end up inside the last column.
        $aLine = rtrim($aLine, "\r\n");
        if ($aLine === "") {
            continue;
        }
        $tLine = explode("\t", $aLine);

        // Column 2 is the IPI identifier; without it there is no subject URI.
        $ipi_id = isset($tLine[2]) ? trim($tLine[2]) : "";
        if ($ipi_id === "") {
            continue;
        }

        // Column 0 names the database the master entry (column 1) comes from.
        // Column 1 may hold a UniProtKB, Vega, Ensembl, RefSeq, TAIR or H-InvDB
        // identifier, so it is only published as a UniProt link when the
        // master database is Swiss-Prot (SP) or TrEMBL (TR).
        $masterIsUniprot = ($tLine[0] === "SP" || $tLine[0] === "TR");

        $entryURI = "http://bio2rdf.org/ipi:" . $ipi_id;
        $buf = "";
        foreach ($xrefColumns as $spec) {
            list($column, $predicate, $namespace) = $spec;
            if (!isset($tLine[$column])) {
                continue;
            }
            if ($column === 1) {
                if (!$masterIsUniprot) {
                    continue;
                }
                $ids = array(getFirstId($tLine[1]));
            } else {
                $ids = readIdentifiers($tLine[$column]);
            }
            if (!is_array($ids)) {
                continue;
            }
            foreach ($ids as $id) {
                if (!is_scalar($id)) {
                    continue;
                }
                $id = trim((string) $id);
                if ($id === "") {
                    continue;
                }
                $buf .= "<{$entryURI}> <http://bio2rdf.org/ipi_vocabulary:{$predicate}> <http://bio2rdf.org/{$namespace}:{$id}> .\n";
            }
        }
        if ($buf !== "") {
            fwrite($outfh, $buf);
        }
    }

    // Leaving the loop before end-of-file means a read failed part-way through.
    if (!gzeof($ifh)) {
        echo "Error: unexpected gzgets() fail!\n";
    }
    gzclose($ifh);
    fclose($outfh);
}
Example #2
0
 /**
  * Turn every data row of the current read file into RDF about one gene.
  *
  * Rows are tab-separated; rows whose first field starts with "#" are
  * skipped. Column 8 must split (via readIdentifiers(), defined outside this
  * method) into exactly two values, gene id and gene symbol; rows without a
  * gene id are skipped because there is no subject to attach triples to.
  *
  * For each remaining row the method adds the gene symbol, chromosome
  * (unless it is "Un"), start/end coordinate and strand as literals, then
  * one link or literal per identifier in the columns listed in $columns, and
  * finally flushes the RDF buffer with WriteRDFBufferToWriteFile().
  *
  * Columns 1, 17, 19 and 20 were read by earlier revisions into locals named
  * cosmid, ccds_ids, vega_genes and (by mistake) refseq_ids, but never
  * emitted; they are not read here.
  * NOTE(review): column 20 presumably holds Vega peptide ids - confirm
  * against the file format before adding output for it.
  *
  * @return void
  */
 private function gene_xrefs()
 {
     // Each entry: column index, vocabulary term, target namespace
     // (null means the value is written as a plain literal).
     // The order matches the order in which the triples of a row are added.
     // NOTE(review): column 18 was held in a local named $refseq_gis and is
     // published with the same predicate/namespace as column 13 - confirm.
     $columns = array(
         array(6, "x-ensembl", "ensembl"),
         array(5, "gene-location", null),
         array(9, "x-ipi", "ipi"),
         array(10, "x-uniprot", "uniprot"),
         array(11, "x-uniprot", "uniprot"),
         array(12, "x-ensembl", "ensembl"),
         array(13, "x-refseq", "refseq"),
         array(14, "x-tair", "tair"),
         array(15, "x-hinv", "hinv"),
         array(16, "x-unigene", "unigene"),
         array(18, "x-refseq", "refseq"),
     );

     while ($aLine = $this->getReadFile()->Read(4096)) {
         $tLine = explode("\t", $aLine);
         if ($this->startsWith($tLine[0], "#")) {
             continue;
         }

         // Gene id and symbol come as a pair; anything else is not usable.
         $gene_id = null;
         $gene_symbol = null;
         if (isset($tLine[8])) {
             $gene_id_t = readIdentifiers($tLine[8]);
             if (is_array($gene_id_t) && count($gene_id_t) == 2) {
                 $gene_id = trim((string) $gene_id_t[0]);
                 $gene_symbol = $gene_id_t[1];
             }
         }
         if ($gene_id === null || $gene_id === "") {
             // No subject for this row: skip it rather than reuse the
             // resource of a previous row.
             continue;
         }
         $res = "gene:" . $gene_id;

         $chromosome = null;
         $chr_arr = readIdentifiers($tLine[0]);
         if (is_array($chr_arr) && isset($chr_arr[0]) && $chr_arr[0] != "Un") {
             $chromosome = $chr_arr[0];
         }

         // Coordinates are only accepted when the column holds a single value.
         $start_coord = null;
         if (isset($tLine[2])) {
             $start_coord_t = readIdentifiers($tLine[2]);
             if (is_array($start_coord_t) && count($start_coord_t) == 1) {
                 $start_coord = $start_coord_t[0];
             }
         }
         $end_coord = null;
         if (isset($tLine[3])) {
             $end_coord_t = readIdentifiers($tLine[3]);
             if (is_array($end_coord_t) && count($end_coord_t) == 1) {
                 $end_coord = $end_coord_t[0];
             }
         }

         // Only the first strand value is published.
         $strand = null;
         if (isset($tLine[4])) {
             $strand_t = readIdentifiers($tLine[4]);
             if (is_array($strand_t) && isset($strand_t[0])) {
                 $strand = $strand_t[0];
             }
         }

         // Gene-level literals; values that are missing or blank are left out.
         $literals = array(
             "gene-symbol" => $gene_symbol,
             "chromosome" => $chromosome,
             "start-coordinate" => $start_coord,
             "end-coordinate" => $end_coord,
             "strand" => $strand,
         );
         foreach ($literals as $term => $value) {
             if ($value === null || trim((string) $value) === "") {
                 continue;
             }
             parent::AddRDF(parent::triplifyString($res, $this->getVoc() . $term, $value));
         }

         // Cross-references and gene locations, one triple per identifier.
         foreach ($columns as $spec) {
             list($column, $term, $namespace) = $spec;
             if (!isset($tLine[$column])) {
                 continue;
             }
             $ids = readIdentifiers($tLine[$column]);
             if (!is_array($ids)) {
                 continue;
             }
             foreach ($ids as $x) {
                 if (!is_scalar($x)) {
                     continue;
                 }
                 // Trimming also discards the line terminator that may be
                 // attached to the last column of the row.
                 $x = trim((string) $x);
                 if ($x === "") {
                     continue;
                 }
                 if ($namespace === null) {
                     parent::AddRDF(parent::triplifyString($res, $this->getVoc() . $term, $x));
                 } else {
                     parent::AddRDF(parent::triplify($res, $this->getVoc() . $term, $namespace . ":" . $x));
                 }
             }
         }

         $this->WriteRDFBufferToWriteFile();
     }
 }