/**
 * Fetch one GenBank nucleotide record from NCBI EFetch and reduce it to a
 * simplified stdClass.
 *
 * The GBSeq XML returned by EFetch is converted to JSON with the local
 * 'xml2json.xslt' stylesheet, decoded, and mapped onto an object carrying:
 *   - accession, accession_version, organism, definition, moltype
 *   - updated / created ("Y-m-d", only set when the record's date parses)
 *   - barcode (true when one of the record's keywords is exactly 'BARCODE')
 *   - gi (int, taken from an "other seqid" ending in gi|<digits>)
 *   - references[] (title, citation, author[], journal, identifier[])
 *   - source (selected qualifiers of the 'source' feature, plus whatever
 *     process_lat_lon() / process_locality() add to it)
 *   - source->geometry (type "Point", [longitude, latitude]) when
 *     source->latitude is set after processing
 *
 * If the response contains several records, the object built from the last
 * one is returned.
 *
 * Relies on get(), parse_name(), process_lat_lon() and process_locality(),
 * which are defined outside this function.
 *
 * @param string|int $id Identifier passed to EFetch as the "id" parameter.
 *
 * @return stdClass|null The simplified record, or null when nothing could be
 *                       fetched, transformed or decoded.
 */
function fetch_sequence($id)
{
    $genbank_sequence = null;

    // Query URL (the id is URL-encoded so it cannot alter the query string)
    $url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id='
        . rawurlencode((string) $id)
        . '&rettype=gb&retmode=xml';

    $xml = get($url);
    if ($xml == '') {
        return $genbank_sequence;
    }

    // Convert the GBSeq XML to JSON using the local stylesheet.
    $xsl = new DOMDocument();
    $xsl->load('xml2json.xslt');

    $xp = new XSLTProcessor();
    $xp->importStylesheet($xsl);

    $dom = new DOMDocument();
    $dom->loadXML($xml);

    $json = $xp->transformToXML($dom);
    if (!is_string($json) || $json == '') {
        // Transformation failed; nothing to decode.
        return $genbank_sequence;
    }

    // fix "-" in variable names so they can be used as object properties
    $json = str_replace(
        array(
            '"GBSeq_feature-table"',
            '"GBSeq_primary-accession"',
            '"GBSeq_other-seqids"',
            '"GBSeq_update-date"',
            '"GBSeq_create-date"',
            '"GBSeq_accession-version"',
        ),
        array(
            '"GBSeq_feature_table"',
            '"GBSeq_primary_accession"',
            '"GBSeq_other_seqids"',
            '"GBSeq_update_date"',
            '"GBSeq_create_date"',
            '"GBSeq_accession_version"',
        ),
        $json
    );

    $sequences = json_decode($json);
    if (!is_object($sequences) || !isset($sequences->GBSet)) {
        // Invalid JSON or an unexpected document shape.
        return $genbank_sequence;
    }

    // 'source' qualifiers that are copied verbatim onto $genbank_sequence->source.
    $copied_qualifiers = array(
        'collection_date',
        'collected_by',
        'country',
        'host',
        'locality',
        'isolation_source',
        'isolate',
        'lat_lon',
        'mol_type',
        'organelle',
        'specimen_voucher',
    );

    foreach ($sequences->GBSet as $GBSet) {
        $genbank_sequence = new stdclass();

        $genbank_sequence->accession         = $GBSet->GBSeq_primary_accession;
        $genbank_sequence->accession_version = $GBSet->GBSeq_accession_version;
        $genbank_sequence->organism          = $GBSet->GBSeq_organism;
        $genbank_sequence->definition        = $GBSet->GBSeq_definition;
        $genbank_sequence->moltype           = $GBSet->GBSeq_moltype;

        // dates: only set when present and parseable
        if (isset($GBSet->GBSeq_update_date)) {
            $timestamp = strtotime($GBSet->GBSeq_update_date);
            if (false !== $timestamp) {
                $genbank_sequence->updated = date("Y-m-d", $timestamp);
            }
        }
        if (isset($GBSet->GBSeq_create_date)) {
            $timestamp = strtotime($GBSet->GBSeq_create_date);
            if (false !== $timestamp) {
                $genbank_sequence->created = date("Y-m-d", $timestamp);
            }
        }

        // keywords
        $genbank_sequence->barcode = false;
        if (!empty($GBSet->GBSeq_keywords)) {
            foreach ($GBSet->GBSeq_keywords as $keyword) {
                // Compare (the previous code assigned here, which flagged
                // every record that had any keyword as a barcode).
                if ($keyword === 'BARCODE') {
                    $genbank_sequence->barcode = true;
                    break;
                }
            }
        }

        // GI number
        $other_seqids = isset($GBSet->GBSeq_other_seqids) ? $GBSet->GBSeq_other_seqids : array();
        foreach ($other_seqids as $seqids) {
            if (preg_match('/gi\\|(?<gi>\\d+)$/', $seqids, $m)) {
                $genbank_sequence->gi = (int) $m['gi'];
            }
        }

        // references
        $genbank_sequence->references = array();
        $GBReferences = isset($GBSet->GBSeq_references) ? $GBSet->GBSeq_references : array();
        foreach ($GBReferences as $GBReference) {
            $reference = new stdclass();
            $reference->title = isset($GBReference->GBReference_title)
                ? $GBReference->GBReference_title
                : null;
            $reference->citation = isset($GBReference->GBReference_journal)
                ? $GBReference->GBReference_journal
                : null;

            if (isset($GBReference->GBReference_authors)) {
                foreach ($GBReference->GBReference_authors as $a) {
                    $parts = parse_name($a);

                    $author = new stdClass();
                    $author->name = $a;
                    if (isset($parts['last'])) {
                        $author->lastname = $parts['last'];
                    }
                    if (isset($parts['first'])) {
                        $author->forename = $parts['first'];
                        if (array_key_exists('middle', $parts)) {
                            $author->forename .= ' ' . $parts['middle'];
                        }
                    }
                    $reference->author[] = $author;
                }
            }

            // Split citations shaped like "Journal 12 (3), 100-110 (2004)".
            if (is_string($reference->citation)
                && preg_match('/(?<journal>.*)\\s+(?<volume>\\d+)(\\s+\\((?<issue>.*)\\))?,\\s+(?<spage>\\d+)-(?<epage>\\d+)\\s+\\((?<year>[0-9]{4})\\)/', $reference->citation, $m)
            ) {
                $reference->journal = new stdclass();
                $reference->journal->name = $m['journal'];
                $reference->journal->volume = $m['volume'];
                if ($m['issue'] != '') {
                    $reference->journal->issue = $m['issue'];
                }
                $reference->journal->pages = $m['spage'];
                if ($m['epage'] != '') {
                    $reference->journal->pages .= '--' . $m['epage'];
                }
            }

            if (isset($GBReference->GBReference_pubmed)) {
                $identifier = new stdclass();
                $identifier->type = 'pmid';
                $identifier->id = (int) $GBReference->GBReference_pubmed;
                $reference->identifier[] = $identifier;
            }

            // isset() on the whole chain tolerates a missing or differently
            // shaped xref instead of raising on an undefined property.
            if (isset($GBReference->GBReference_xref->GBXref->GBXref_dbname)
                && $GBReference->GBReference_xref->GBXref->GBXref_dbname == 'doi'
            ) {
                $identifier = new stdclass();
                $identifier->type = 'doi';
                $identifier->id = $GBReference->GBReference_xref->GBXref->GBXref_id;
                $reference->identifier[] = $identifier;
            }

            $genbank_sequence->references[] = $reference;
        }

        // 'source' feature
        $features = isset($GBSet->GBSeq_feature_table) ? $GBSet->GBSeq_feature_table : array();
        foreach ($features as $feature_table) {
            if (!isset($feature_table->GBFeature_key) || $feature_table->GBFeature_key != 'source') {
                continue;
            }

            if (!isset($genbank_sequence->source)) {
                $genbank_sequence->source = new stdclass();
            }

            $quals = isset($feature_table->GBFeature_quals) ? $feature_table->GBFeature_quals : array();
            foreach ($quals as $feature_quals) {
                if (!isset($feature_quals->GBQualifier_name)) {
                    continue;
                }
                $name  = $feature_quals->GBQualifier_name;
                $value = isset($feature_quals->GBQualifier_value)
                    ? $feature_quals->GBQualifier_value
                    : null;

                if ($name == 'db_xref') {
                    // A source feature may carry several db_xref qualifiers;
                    // only the "taxon:" one is a taxonomy id. Previously any
                    // other db_xref overwrote tax_id with 0.
                    if (is_string($value) && strpos($value, 'taxon:') === 0) {
                        $genbank_sequence->source->tax_id = (int) substr($value, strlen('taxon:'));
                    }
                } elseif (in_array($name, $copied_qualifiers, true)) {
                    $genbank_sequence->source->{$name} = $value;
                }
            }

            if (isset($genbank_sequence->source->lat_lon)) {
                process_lat_lon($genbank_sequence);
            }
            process_locality($genbank_sequence);
        }

        // Point geometry, once a latitude is available on the source.
        if (isset($genbank_sequence->source->latitude)) {
            $genbank_sequence->source->geometry = new stdclass();
            $genbank_sequence->source->geometry->type = "Point";
            $genbank_sequence->source->geometry->coordinates = array(
                (double) $genbank_sequence->source->longitude,
                (double) $genbank_sequence->source->latitude,
            );
        }
    }

    return $genbank_sequence;
}
/**
 * Fetch several GenBank nucleotide records from NCBI EFetch in one request
 * and return them together with a MultiPoint geometry of their coordinates.
 *
 * The returned object has:
 *   - ids: the input list, unchanged
 *   - sequences[]: one simplified object per record (accession, organism,
 *     created, updated, gi, references[], source)
 *   - geometry: type "MultiPoint" with one [longitude, latitude] pair for
 *     every record whose source has a latitude after processing
 *
 * Relies on get(), format_date(), process_lat_lon() and process_locality(),
 * which are defined outside this function.
 *
 * @param array $ids Identifiers sent to EFetch as a comma-separated "id" list.
 *
 * @return stdClass Result container; sequences and geometry->coordinates stay
 *                  empty when nothing could be fetched, transformed or decoded.
 */
function fetch_sequences($ids)
{
    $hits = new stdclass();
    $hits->ids = $ids;
    $hits->sequences = array();
    $hits->geometry = new stdclass();
    $hits->geometry->type = "MultiPoint";
    $hits->geometry->coordinates = array();

    // Nothing to ask for: skip the request entirely.
    if (empty($ids)) {
        return $hits;
    }

    // Query URL (each id is URL-encoded; the separating commas stay literal)
    $url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id='
        . join(",", array_map('rawurlencode', $ids))
        . '&rettype=gb&retmode=xml';

    $xml = get($url);
    if ($xml == '') {
        return $hits;
    }

    // Convert the GBSeq XML to JSON using the local stylesheet.
    $xsl = new DOMDocument();
    $xsl->load('xml2json.xslt');

    $xp = new XSLTProcessor();
    $xp->importStylesheet($xsl);

    $dom = new DOMDocument();
    $dom->loadXML($xml);

    $json = $xp->transformToXML($dom);
    if (!is_string($json) || $json == '') {
        // Transformation failed; nothing to decode.
        return $hits;
    }

    // fix "-" in variable names so they can be used as object properties
    $json = str_replace(
        array(
            '"GBSeq_feature-table"',
            '"GBSeq_primary-accession"',
            '"GBSeq_other-seqids"',
            '"GBSeq_create-date"',
            '"GBSeq_update-date"',
        ),
        array(
            '"GBSeq_feature_table"',
            '"GBSeq_primary_accession"',
            '"GBSeq_other_seqids"',
            '"GBSeq_create_date"',
            '"GBSeq_update_date"',
        ),
        $json
    );

    $sequences = json_decode($json);
    if (!is_object($sequences) || !isset($sequences->GBSet)) {
        // Invalid JSON or an unexpected document shape.
        return $hits;
    }

    // 'source' qualifiers that are copied verbatim onto $genbank_sequence->source.
    $copied_qualifiers = array(
        'country',
        'host',
        'locality',
        'isolation_source',
        'isolate',
        'lat_lon',
        'specimen_voucher',
    );

    foreach ($sequences->GBSet as $GBSet) {
        $genbank_sequence = new stdclass();

        $genbank_sequence->accession = $GBSet->GBSeq_primary_accession;
        $genbank_sequence->organism  = $GBSet->GBSeq_organism;
        $genbank_sequence->created   = format_date($GBSet->GBSeq_create_date);
        $genbank_sequence->updated   = format_date($GBSet->GBSeq_update_date);

        // GI number (kept as the matched string, as before)
        $other_seqids = isset($GBSet->GBSeq_other_seqids) ? $GBSet->GBSeq_other_seqids : array();
        foreach ($other_seqids as $seqids) {
            if (preg_match('/gi\\|(?<gi>\\d+)$/', $seqids, $m)) {
                $genbank_sequence->gi = $m['gi'];
            }
        }

        // references
        $genbank_sequence->references = array();
        $GBReferences = isset($GBSet->GBSeq_references) ? $GBSet->GBSeq_references : array();
        foreach ($GBReferences as $GBReference) {
            $reference = new stdclass();
            $reference->title = isset($GBReference->GBReference_title)
                ? $GBReference->GBReference_title
                : null;
            $reference->citation = isset($GBReference->GBReference_journal)
                ? $GBReference->GBReference_journal
                : null;

            $reference->authors = array();
            if (isset($GBReference->GBReference_authors)) {
                foreach ($GBReference->GBReference_authors as $author) {
                    $reference->authors[] = $author;
                }
            }

            if (isset($GBReference->GBReference_pubmed)) {
                if (!isset($reference->identifiers)) {
                    $reference->identifiers = new stdclass();
                }
                $reference->identifiers->pmid = $GBReference->GBReference_pubmed;
            }

            // isset() on the whole chain tolerates a missing or differently
            // shaped xref instead of raising on an undefined property.
            if (isset($GBReference->GBReference_xref->GBXref->GBXref_dbname)
                && $GBReference->GBReference_xref->GBXref->GBXref_dbname == 'doi'
            ) {
                if (!isset($reference->identifiers)) {
                    $reference->identifiers = new stdclass();
                }
                $reference->identifiers->doi = $GBReference->GBReference_xref->GBXref->GBXref_id;
            }

            $genbank_sequence->references[] = $reference;
        }

        // 'source' feature
        $features = isset($GBSet->GBSeq_feature_table) ? $GBSet->GBSeq_feature_table : array();
        foreach ($features as $feature_table) {
            if (!isset($feature_table->GBFeature_key) || $feature_table->GBFeature_key != 'source') {
                continue;
            }

            // Create the container up front: assigning a property through an
            // undefined ->source raises a warning on PHP 7 and an Error on PHP 8.
            if (!isset($genbank_sequence->source)) {
                $genbank_sequence->source = new stdclass();
            }

            $quals = isset($feature_table->GBFeature_quals) ? $feature_table->GBFeature_quals : array();
            foreach ($quals as $feature_quals) {
                if (!isset($feature_quals->GBQualifier_name)) {
                    continue;
                }
                $name  = $feature_quals->GBQualifier_name;
                $value = isset($feature_quals->GBQualifier_value)
                    ? $feature_quals->GBQualifier_value
                    : null;

                if ($name == 'db_xref') {
                    // A source feature may carry several db_xref qualifiers;
                    // only the "taxon:" one is a taxonomy id. Previously any
                    // other db_xref overwrote tax_id with its raw value.
                    if (is_string($value) && strpos($value, 'taxon:') === 0) {
                        $genbank_sequence->source->tax_id = substr($value, strlen('taxon:'));
                    }
                } elseif (in_array($name, $copied_qualifiers, true)) {
                    $genbank_sequence->source->{$name} = $value;
                }
            }

            if (isset($genbank_sequence->source->lat_lon)) {
                process_lat_lon($genbank_sequence);
            }
            process_locality($genbank_sequence);
        }

        // Collect the record's point for the combined MultiPoint geometry.
        if (isset($genbank_sequence->source->latitude)) {
            $hits->geometry->coordinates[] = array(
                $genbank_sequence->source->longitude,
                $genbank_sequence->source->latitude,
            );
        }

        $hits->sequences[] = $genbank_sequence;
    }

    return $hits;
}