$aGenesAlreadyMappedTo[] = $aTranscript['gene']; } } else { // We don't have the gene that this transcript belongs to in the database yet. // Save it for now, we may need to add it later on if the variant can't be mapped to something that already exists in LOVD. $aGenesWeCanMapTo[] = $aTranscript['gene']; } } } $aGenesWeCanMapTo = array_unique($aGenesWeCanMapTo); if ($aVariant['mapping_flags'] & MAPPING_ALLOW_CREATE_GENES && count($aGenesWeCanMapTo)) { // We may add extra genes to map this variant to. $aGenes contains genes we can map to. // Try the genes one by one. foreach ($aGenesWeCanMapTo as $sGene) { // Get information from HGNC. $aGeneInfoFromHgnc = lovd_getGeneInfoFromHgncOld($sGene, array('gd_hgnc_id', 'gd_app_sym', 'gd_app_name', 'gd_pub_chrom_map', 'gd_locus_type', 'gd_pub_eg_id', 'md_mim_id', 'gd_pub_refseq_ids', 'md_refseq_id'), true); if (empty($aGeneInfoFromHgnc)) { // Couldn't find this gene. Try the next. continue; } list($sHgncID, $sSymbol, $sGeneName, $sChromLocation, $sLocusType, $sEntrez, $sOmim, $sRefseq1, $sRefseq2) = array_values($aGeneInfoFromHgnc); // Get LRG if it exists. if (!($sRefseqGenomic = lovd_getLRGbyGeneSymbol($sSymbol))) { // No LRG, get NG if it exists. if (!($sRefseqGenomic = lovd_getNGbyGeneSymbol($sSymbol))) { // Also no NG, use the NC instead. $sRefseqGenomic = $_SETT['human_builds'][$_CONF['refseq_build']]['ncbi_sequences'][$sChromosome]; } } // Get UD. try {
function lovd_getGeneInfoFromHgncOld($sHgncId, $aCols, $bRecursion = false)
{
    // Downloads gene information from the HGNC website. The specified columns will be retrieved.
    //
    // $sHgncId    An HGNC accession number (digits only), an HGNC approved gene symbol, or boolean true to
    //             retrieve ALL genes.
    // $aCols      List of HGNC column names to return (e.g. 'gd_hgnc_id', 'gd_app_sym').
    // $bRecursion If true, a deprecated symbol ("symbol withdrawn, see X") is followed once to its replacement.
    //
    // Returns an associative array (column => value) for a single entry; when all genes have been requested, an
    //  array of such arrays is returned with the gene symbols as keys.
    // On error, this function calls lovd_errorAdd() if that function is available (inc-lib-form.php included).
    // It always returns false on failure.

    // Process columns.
    // $aColumns will be extended with columns we need internally, whereas $aCols defines the return value and as
    //  such must not be changed.
    $aColumns = $aCols;
    $sColumns = '';
    foreach ($aCols as $sColumn) {
        $sColumns .= 'col=' . $sColumn . '&';
    }

    // Make sure we request the right data.
    if ($sHgncId === true) {
        // Boolean true; return bulk data.
        $sWhere = '';
        // Using approved symbols as array keys, so we need to get them from the HGNC.
        if (!in_array('gd_app_sym', $aCols)) {
            $sColumns .= 'col=gd_app_sym&';
            $aColumns[] = 'gd_app_sym';
        }
    } else {
        // Cast to string first; ctype_digit() interprets small integers as ASCII codes and no longer accepts
        //  non-string arguments in recent PHP versions.
        if (ctype_digit((string) $sHgncId)) {
            // HGNC database ID.
            $sWhere = 'gd_hgnc_id%3D' . $sHgncId;
        } else {
            // FIXME; implement proper check on gene symbol.
            // Gene symbol; also match SYMBOL~withdrawn to be able to use a deprecated symbol as search key.
            $sWhere = rawurlencode('gd_app_sym IN ("' . $sHgncId . '", "' . $sHgncId . '~withdrawn")');
        }
        // We also surely need gd_app_name to check for and handle withdrawn or deprecated entries.
        if (!in_array('gd_app_name', $aCols)) {
            $sColumns .= 'col=gd_app_name&';
            $aColumns[] = 'gd_app_name';
        }
    }

    $aHgncFile = lovd_php_file('http://www.genenames.org/cgi-bin/download?' . $sColumns . 'status_opt=2&where=' . $sWhere . '&order_by=gd_app_sym_sort&limit=&format=text&submit=submit');

    // If the HGNC is having database problems, we get an HTML page.
    if (empty($aHgncFile) || stripos(implode($aHgncFile), '<html') !== false) {
        if (function_exists('lovd_errorAdd')) {
            lovd_errorAdd('', 'Couldn\'t get gene information, probably because the HGNC is having database problems.');
        }
        return false;
    }

    // Columns that we added ourselves and that the caller hasn't asked for.
    $aUnwantedColumns = array_diff($aColumns, $aCols);
    $nColumns = count($aColumns);

    if ($sHgncId === true) {
        // Got bulk data.
        $aHGNCgenes = array();
        array_shift($aHgncFile); // Drop the header line.
        foreach ($aHgncFile as $sGene) {
            $aValues = explode("\t", $sGene);
            if (count($aValues) != $nColumns) {
                // Blank or malformed line; array_combine() would fail on it, so skip it.
                continue;
            }
            $aGene = array_combine($aColumns, $aValues);
            $sSymbol = str_replace('~withdrawn', '', $aGene['gd_app_sym']);
            if (!empty($aHGNCgenes[$sSymbol]) && $sSymbol !== $aGene['gd_app_sym']) {
                // Symbol has been deprecated and then reassigned to another gene, don't overwrite that one.
                continue;
            }
            foreach ($aUnwantedColumns as $sUnwantedColumn) {
                // Don't return columns the caller hasn't asked for.
                unset($aGene[$sUnwantedColumn]);
            }
            $aHGNCgenes[$sSymbol] = $aGene;
        }
        return $aHGNCgenes;
    }

    // Requested single entry.
    if (isset($aHgncFile[1])) {
        // Looks like we've got valid data here.
        $aValues = explode("\t", $aHgncFile[1]);
        if (count($aValues) != $nColumns) {
            // The data line doesn't match the columns we requested; we can't interpret it.
            if (function_exists('lovd_errorAdd')) {
                lovd_errorAdd('', 'Couldn\'t get gene information, probably because the HGNC is having database problems.');
            }
            return false;
        }
        $aGene = array_combine($aColumns, $aValues);

        // We might encorporate one or more of these locus types excludes later, so that we can throw an error without first calling mutalyzer a number of times.
        //$aBadLocusTypes = array('RNA, cluster', 'RNA, transfer', 'RNA, ribosomal', 'transposable element', 'virus integration site', 'phenotype only', 'unknown', 'region', 'complex locus constituent', 'endogenous retrovirus', 'fragile site', 'T cell receptor gene', 'T cell receptor pseudogene');
        $aBadLocusTypes = array('phenotype only');

        if ($aGene['gd_app_name'] === 'entry withdrawn') {
            if (function_exists('lovd_errorAdd')) {
                lovd_errorAdd('hgnc_id', 'Entry ' . htmlspecialchars($sHgncId) . ' no longer exists in the HGNC database.');
            }
            return false;

        } elseif (preg_match('/^symbol withdrawn, see (.+)$/', $aGene['gd_app_name'], $aRegs)) {
            if ($bRecursion) {
                // Follow the deprecated entry to its replacement; recursion is not passed on, so we follow one step only.
                return lovd_getGeneInfoFromHgncOld($aRegs[1], $aCols);
            } elseif (function_exists('lovd_errorAdd')) {
                // The replacement symbol comes from the remote response, so escape it like we escape the input.
                lovd_errorAdd('hgnc_id', 'Entry ' . htmlspecialchars($sHgncId) . ' is deprecated according to the HGNC, please use ' . htmlspecialchars($aRegs[1]) . '.');
            }
            return false;

        } elseif (in_array('gd_pub_chrom_map', $aCols) && $aGene['gd_pub_chrom_map'] === 'reserved') {
            if (function_exists('lovd_errorAdd')) {
                lovd_errorAdd('hgnc_id', 'Entry ' . htmlspecialchars($sHgncId) . ' does not yet have a public association with a chromosomal location');
            }
            return false;

        } elseif (in_array('gd_locus_type', $aCols) && in_array($aGene['gd_locus_type'], $aBadLocusTypes, true)) {
            if (function_exists('lovd_errorAdd')) {
                lovd_errorAdd('hgnc_id', 'LOVD cannot process this type of gene entry ' . htmlspecialchars($sHgncId) . ' (Locus Type: ' . htmlspecialchars($aGene['gd_locus_type']) . ').');
            }
            return false;
        }

        foreach ($aUnwantedColumns as $sUnwantedColumn) {
            // Don't return columns the caller hasn't asked for.
            unset($aGene[$sUnwantedColumn]);
        }

        // 2016-09-14; 3.0-17; HGNC can return multiple OMIM IDs.
        if (isset($aGene['md_mim_id']) && preg_match('/^(\d+), /', $aGene['md_mim_id'], $aRegs)) {
            // Just trim the other(s) off.
            $aGene['md_mim_id'] = $aRegs[1];
        }
        return $aGene;

    } elseif (function_exists('lovd_errorAdd')) {
        // No match found, start looking for alias. We could have included an OR in the original search, but I am not
        //  sure if that would maybe have other genes pop up while the official gene is then ignored.
        // We only do this search if we can report it, of course (hence the check for lovd_errorAdd()).

        // To list suggestions we need the approved symbols, also when the caller didn't ask for them.
        $aAliasColumns = $aColumns;
        $sAliasColumns = $sColumns;
        if (!in_array('gd_app_sym', $aAliasColumns)) {
            $sAliasColumns .= 'col=gd_app_sym&';
            $aAliasColumns[] = 'gd_app_sym';
        }
        $nAliasColumns = count($aAliasColumns);

        // Replace WHERE.
        $sWhere = rawurlencode('CONCAT(" ", gd_aliases, ",") LIKE "% ' . $sHgncId . ',%"');
        $aHgncFile = lovd_php_file('http://www.genenames.org/cgi-bin/download?' . $sAliasColumns . 'status_opt=2&where=' . $sWhere . '&order_by=gd_app_sym_sort&limit=&format=text&submit=submit');

        // Just quick check if we have a match now...
        if (!empty($aHgncFile) && stripos(implode($aHgncFile), '<html') === false) {
            unset($aHgncFile[0]); // Drop the header line.
            $sSymbolList = '';
            foreach ($aHgncFile as $sLine) {
                $aValues = explode("\t", $sLine);
                if (count($aValues) != $nAliasColumns) {
                    // Blank or malformed line; ignore.
                    continue;
                }
                $aGene = array_combine($aAliasColumns, $aValues);
                $sSymbolList .= (!$sSymbolList ? '' : ', ') . htmlspecialchars($aGene['gd_app_sym']);
            }
            if ($sSymbolList) {
                // "Prettify" the output by replacing the last , by an "or".
                $sSymbolList = preg_replace('/, ([^ ]+)$/', " or \$1", $sSymbolList);
                lovd_errorAdd('hgnc_id', 'Entry ' . htmlspecialchars($sHgncId) . ' was not found, perhaps you are referring to ' . $sSymbolList . '?');
            } else {
                lovd_errorAdd('hgnc_id', 'Entry ' . htmlspecialchars($sHgncId) . ' was not found in the HGNC database.');
            }
        }
    }

    return false;
}
// Now that we've parsed the transcript accession number, let's see if we need to do anything with its gene. if ($sSymbol != 'none' && empty($aGenesChecked[$sSymbol])) { // We haven't seen this gene before in this upload. // First try to get this gene from the database. if ($aGene = $_DB->query('SELECT g.refseq_UD, g.name, IF(IFNULL(t.position_g_mrna_start, 0) = IFNULL(t.position_g_mrna_end, 0), NULL, IF(t.position_g_mrna_start < t.position_g_mrna_end, "+", "-")) AS strand FROM ' . TABLE_GENES . ' AS g LEFT JOIN ' . TABLE_TRANSCRIPTS . ' AS t ON (g.id = t.geneid) WHERE g.id = ? ORDER BY t.id ASC LIMIT 1', array($sSymbol))->fetchAssoc()) { // We've got it in the database. Check its columns. $aGene['columns'] = $_DB->query('SELECT colid FROM ' . TABLE_SHARED_COLS . ' WHERE geneid = ? AND colid IN("' . implode('", "', $aVOTCols) . '")', array($sSymbol))->fetchAllColumn(); $aGenesChecked[$sSymbol] = $aGene; } elseif (strpos($_POST['autocreate'], 'g') !== false) { // We don't have this gene in the database yet. Try to add it instead. $_BAR->setMessage('Loading gene information for ' . $sSymbol . '...', 'done'); if (empty($aGeneInfo)) { // Getting all gene information from the HGNC takes a few seconds. $_BAR->setMessage('Loading gene data...', 'done'); $aGeneInfo = lovd_getGeneInfoFromHgncOld(true, array('gd_hgnc_id', 'gd_app_sym', 'gd_app_name', 'gd_pub_chrom_map', 'gd_locus_type', 'gd_pub_eg_id', 'md_mim_id')); if (empty($aGeneInfo)) { // We can't gene information from the HGNC, so we can't add them. // This is a major problem and we can't just continue; the user will have to give permission not to create new gene entries. ob_start(); lovd_showInfoTable('Could not get any gene information from the HGNC database! If this problem persists, consider importing the file without creating new gene entries. ' . 
'If this is not an option for you, please try again later.', 'stop'); $_BAR->setMessage(ob_get_clean(), 'done'); exit('</BODY></HTML>'); } // Remove the Loading gene data...' message again. $_BAR->setMessage('Loading gene information for ' . $sSymbol . '...', 'done'); } // Get HGNC data for this gene. while (true) { if (empty($aGeneInfo[$sSymbol]) || $aGeneInfo[$sSymbol]['gd_app_name'] == 'entry withdrawn' || $aGeneInfo[$sSymbol]['gd_pub_chrom_map'] == 'reserved') { // Can't use this symbol.