/**
 * Expand a comma-separated list of numbers and numeric ranges into a
 * comma-separated list of individual numbers.
 *
 * Input looks like "1-15,10,1,12,423,312312,123-124". Only digits, '-' and
 * ',' are interpreted; every other character (including whitespace) is
 * skipped. An item of the form "from-to" is expanded by calling
 * expand_range(), which is defined elsewhere in the project. An item that is
 * not a complete range contributes only the digits collected outside the
 * range (possibly nothing), but its separating comma is still emitted, so
 * "1-,5" yields ",5". A single trailing comma is removed from the result.
 *
 * @param string $range List of numbers and ranges; other scalars are cast to string.
 * @return string Comma-separated expansion; '' for empty input.
 */
function expand_number_range($range)
{
    // Cast first: indexing an int with [] does not yield its digits.
    $range = (string) $range;
    $length = strlen($range);

    // Turns the parser state for one item into its output text.
    $render = function ($left, $right, $single) {
        if (is_numeric($left) && is_numeric($right)) {
            $expanded = expand_range($left, $right);
            // Other code in this file iterates expand_range()'s result with
            // foreach, so an array is joined here instead of being
            // stringified as "Array". NOTE(review): confirm the return type.
            return is_array($expanded) ? implode(',', $expanded) : (string) $expanded;
        }
        return $single;
    };

    $retval = '';
    $number = '';        // digits of a plain number, or of a range start before '-'
    $left_number = '';   // range start, set when '-' is seen
    $right_number = '';  // range end, collected after '-'

    for ($i = 0; $i < $length; $i++) {
        $char = $range[$i];

        if (is_numeric($char)) {
            if (is_numeric($left_number)) {
                $right_number .= $char;
            } else {
                $number .= $char;
            }
        } elseif ($char === '-') {
            $left_number = $number;
            $number = '';
        } elseif ($char === ',') {
            $retval .= $render($left_number, $right_number, $number) . ',';

            // Reset everything so an unfinished range ("1-,5") cannot leak
            // its start or end into the next item.
            $number = '';
            $left_number = '';
            $right_number = '';
        }
    }

    // The last item has no terminating comma.
    $retval .= $render($left_number, $right_number, $number);

    // Drop one trailing comma; the emptiness check avoids reading an offset
    // of an empty string.
    if ($retval !== '' && substr($retval, -1) === ',') {
        $retval = substr($retval, 0, -1);
    }

    return $retval;
}
/**
 * Extract museum specimen codes from free text.
 *
 * The text is scanned with several patterns, in this order: a generic
 * "[acronym] [number]" pattern for a fixed list of institution acronyms, then
 * special cases for BMNH (e.g. "BMNH1947.2.26.89"), MNHN, NCA/QVM/ZSM
 * ("123/2005" or "123:2005" style numbers) and NHM ("R.1234" style numbers).
 * Results are returned grouped by pattern in that order, not by position in
 * the text.
 *
 * When a match ends in a dash followed by digits (e.g. "AMNH 12345-47") the
 * number and the end value are passed to expand_range(), which is defined
 * elsewhere in the project, and one id is emitted per value it yields.
 *
 * @param string $t Text to scan.
 * @return array List of strings of the form "<code> <number>".
 */
function extract_specimen_codes($t)
{
    $t = (string) $t;

    // Standard acronyms that have simple [Acronym] [number] specimen codes
    // (allowing for a prefix before [number]). Entries are regular-expression
    // fragments, not plain strings, so they must not be passed to preg_quote().
    $acronyms = array(
        'ABTC', 'ADT-CRBMUV', 'AM M', 'AMCC', 'AMNH', 'ANSP', 'ANWC', 'AMS',
        'AMS\\. [1I]\\.', 'ASIZB', 'ASU', 'BBM', 'BNHS', 'BPBM', 'CAS',
        'CASENT', 'CAS-SU', 'CFBH', 'CM', 'CMK', 'CRBMUV', 'CWM', 'DHMECN',
        'FMNH', 'HKU', 'IBUNAM-EM', 'ICN', 'ICN-MHN-CR', 'ILPLA', 'INHS',
        'IRSNB', 'IZUA', 'JAC', 'JCV', 'JM', 'KFBG', 'KU', 'KUHE', 'LACM',
        'LSUMZ', 'MACN', 'MACN-Ict', 'MCP', 'MCNU', 'MCSN', 'MCZ', 'MFA-ZV-I',
        'MHNCI', 'MNCN', 'MHNG', 'MHNUC', 'MNRJ', 'MPEG', 'MRAC', 'MRT', 'MUJ',
        'MVUP', 'MVZ', 'MZUC', 'MZUFV', 'MZUSP', 'NHMW', 'NRM', 'NSV', 'NT',
        'NTM', 'OMNH', 'QCAZ', 'QM', 'QMJ', 'RAN', 'RMNH', 'ROM', 'SAMA',
        'SIUC', 'TNHC', 'THNHM', 'UAZ', 'UCR', 'UFMG', 'UMFS', 'UMMZ', 'UNT',
        'USNM', 'USNM\\.', 'USNMENT', 'USNM\\sENT', 'UTA', 'UWBM', 'WAM',
        'WHT', 'YPM', 'ZFMK', 'ZMA', 'ZMB', 'ZMH', 'ZRC', 'ZSI F', 'ZUFRJ',
    );

    // The patterns below use the /x modifier, which throws away literal
    // whitespace. A plain space inside an acronym ("AM M", "ZSI F") would
    // therefore never match a space in the text, so it is turned into an
    // explicit \s*. Using \s* rather than \s keeps the unspaced spelling
    // matching as well.
    $acronym_alternatives = str_replace(' ', '\\s*', implode('|', $acronyms));

    // Separator between the start and end of a range: hyphen, en dash or em dash.
    $dash = '(\\-|–|—)';

    $patterns = array(
        // Typical code [A-Z] \d+, allowing for some quirks such as letter
        // prefixes for the number, with an optional range end.
        '/
            (?<code>' . $acronym_alternatives . ')
            \\s*
            (:|_|\\-)?
            (?<number>((?<prefix>(J|R|A[.\\s]?|A\\-))?[0-9]{3,}))
            (' . $dash . '(?<end>[0-9]{2,}))?
        /x',

        // BMNH, e.g. BMNH1947.2.26.89
        '/
            (?<code>BMNH)
            \\s*
            (?<number>([0-9]{2,4}(\\.[0-9]+)+))
            (' . $dash . '(?<end>[0-9]+))?
        /x',

        // MNHN: four digits, a dot, then more digits
        '/
            (?<code>MNHN)
            \\s*
            (?<number>([0-9]{4}\\.[0-9]+))
            (' . $dash . '(?<end>[0-9]+))?
        /x',

        // NCA, QVM, ZSM: two numbers joined by ":" or "/"
        '/
            (?<code>NCA|QVM|ZSM)
            \\s*
            (?<number>([0-9]+(:|\\/)[0-9]+))
        /x',

        // NHM: "R" (optionally followed by a dot) and digits
        '/
            (?<code>NHM)
            \\s+
            (?<number>(R\\.?[0-9]+))
        /x',
    );

    $ids = array();

    foreach ($patterns as $pattern) {
        if (!preg_match_all($pattern, $t, $matches, PREG_SET_ORDER)) {
            continue;
        }

        foreach ($matches as $m) {
            // Collapse whatever whitespace was matched inside the code so the
            // fix-ups below also apply when the text had a tab or line break.
            $code = preg_replace('/\\s+/', ' ', $m['code']);

            // Fix any codes that seem broken
            if ($code === 'USNM ENT') {
                $code = 'USNMENT';
            } elseif ($code === 'USNM.') {
                $code = 'USNM';
            }

            // With PREG_SET_ORDER a trailing group that did not take part in
            // the match is absent, and some patterns have no "end" group.
            $end = isset($m['end']) ? $m['end'] : '';

            if ($end === '') {
                $ids[] = $code . ' ' . $m['number'];
                continue;
            }

            // Handle lists of specimens given as a range.
            foreach (expand_range($m['number'], $end) as $r) {
                $ids[] = $code . ' ' . $r;
            }
        }
    }

    return $ids;
}