/**
 * Splits a full personal name into salutation, first name, initials,
 * last name and suffix.
 *
 * Words are separated by single spaces; words starting with "(" are ignored,
 * as are empty words produced by repeated spaces. The first word is tested
 * with is_salutation() and the last word with is_suffix(); whatever those
 * helpers return is stored as-is and is only used here as a truthy/falsy flag.
 *
 * @param string $full_name The raw name, e.g. "Mr. R. Jason Von Smith Jr."
 * @return array Associative array with the keys 'salutation', 'fname',
 *               'initials', 'lname' and 'suffix'. When the input contains no
 *               usable words, 'salutation' and 'suffix' are false and the
 *               other entries are empty strings.
 *               NOTE(review): false is assumed to be the "no match" value of
 *               is_salutation()/is_suffix() - confirm against those helpers.
 */
function split_full_name($full_name) {
    // Start every accumulated part as an empty string so that a name without
    // initials / last name / first name does not read an undefined variable.
    $fname = '';
    $initials = '';
    $lname = '';
    $name_parts = array();

    $full_name = trim((string) $full_name);

    // Split into words; drop empty words (repeated spaces, empty input) and
    // completely ignore any word in parentheses.
    foreach (explode(" ", $full_name) as $word) {
        if ($word === '' || $word[0] === "(") {
            continue;
        }
        $name_parts[] = $word;
    }
    $num_words = count($name_parts);

    // Nothing to parse: return the empty result instead of indexing into an
    // empty list.
    if ($num_words === 0) {
        return array(
            'salutation' => false,
            'fname'      => '',
            'initials'   => '',
            'lname'      => '',
            'suffix'     => false,
        );
    }

    // Is the first word a title (Mr., Mrs, etc)? Is the last word a suffix?
    $salutation = is_salutation($name_parts[0]);
    $suffix = is_suffix($name_parts[$num_words - 1]);

    // Range [$start, $end) covers the words between salutation and suffix.
    $start = $salutation ? 1 : 0;
    $end = $suffix ? $num_words - 1 : $num_words;

    // Concatenate the first name. The final word of the range is always
    // reserved for the last name, hence the "$end - 1" bound.
    for ($i = $start; $i < $end - 1; $i++) {
        $word = $name_parts[$i];

        // Move on to the last name when we find an indicator of a compound
        // last name (Von, Van, etc). The very first word is exempt to allow
        // for rare cases where the indicator is the first name ("Von Fabella").
        if ($i !== $start && is_compound_lname($word)) {
            break;
        }

        if (!is_initial($word)) {
            $fname .= " " . fix_case($word);
            continue;
        }

        // A leading initial followed by another initial is treated as the
        // first name ("R. J. Smith" => fname "R."). A leading initial followed
        // by a full word means the person goes by their middle name
        // ("R. Jason Smith" => fname "Jason", initials "R."). Any later
        // initial is simply stored as an initial.
        if ($i === $start && is_initial($name_parts[$i + 1])) {
            $fname .= " " . strtoupper($word);
        } else {
            $initials .= " " . strtoupper($word);
        }
    }

    $remaining = $end - $start;
    if ($remaining > 1) {
        // Everything not consumed by the first-name loop is the last name.
        for (; $i < $end; $i++) {
            $lname .= " " . fix_case($name_parts[$i]);
        }
    } elseif ($remaining === 1) {
        // A single remaining word is assumed to be a first name.
        $fname = fix_case($name_parts[$start]);
    }
    // $remaining < 1: the input consisted only of a salutation and/or suffix,
    // so there is no name word to read.

    // Return the various parts in an array.
    $name = array();
    $name['salutation'] = $salutation;
    $name['fname'] = trim($fname);
    $name['initials'] = trim($initials);
    $name['lname'] = trim($lname);
    $name['suffix'] = $suffix;
    return $name;
}
/**
 * Splits text into tokens using a table of per-feature-vector coefficients.
 *
 * The text is normalised to Unicode NFC and walked character by character.
 * For each position a list of feature values is built from the current and
 * neighbouring characters and, when a hyphen or one of the punctuation marks
 * . / ? = : & " ! + ( ) is at or right after the position, from the
 * surrounding "chain" of characters. The feature values are concatenated into
 * a string, read as a binary number with bindec(), and looked up in the
 * coefficients loaded from the tokenizer_coeff table. A token boundary is
 * placed after the current character when the coefficient is greater than 0;
 * vectors missing from the table get 0.5 and therefore split.
 *
 * NOTE(review): the feature helpers (char_class(), is_number(), ...) are
 * presumably expected to yield 0/1 values so the concatenation is a valid
 * binary string - confirm against their definitions.
 *
 * @param string $txt        Text to tokenize.
 * @param mixed  $exceptions Passed through to is_exception().
 * @param mixed  $prefixes   Passed through to is_prefix().
 * @return array List of array(token, coefficient, "<decimal>=<binary vector>").
 */
function tokenize_ml($txt, $exceptions, $prefixes) {
    $coeff = array();
    $out = array();
    $token = '';

    $txt = Normalizer::normalize($txt, Normalizer::FORM_C);

    // Load the coefficient table, keyed by the first column of each row with
    // the second column as the value (the code relies on column order).
    $res = sql_query("SELECT * FROM tokenizer_coeff");
    while ($r = sql_fetch_array($res)) {
        $coeff[$r[0]] = $r[1];
    }

    // Trailing space so that the last real character has a following character.
    $txt .= ' ';

    // $txt is not modified below, so its length is computed once instead of
    // on every iteration of the outer and inner loops.
    $txt_len = mb_strlen($txt, 'UTF-8');
    $punct_re = '/([\\.\\/\\?\\=\\:&"!\\+\\(\\)])/u';

    for ($i = 0; $i < $txt_len; ++$i) {
        $prevchar = $i > 0 ? mb_substr($txt, $i - 1, 1, 'UTF-8') : '';
        $char = mb_substr($txt, $i, 1, 'UTF-8');
        $nextchar = mb_substr($txt, $i + 1, 1, 'UTF-8');
        $nnextchar = mb_substr($txt, $i + 2, 1, 'UTF-8');

        // $chain is the current word which we will perhaps need to check in
        // the dictionary; $chain_left / $chain_right are its parts on either
        // side of the odd symbol.
        $chain = $chain_left = $chain_right = '';
        $odd_symbol = '';
        if (is_hyphen($char) || is_hyphen($nextchar)) {
            $odd_symbol = '-';
        } elseif (preg_match($punct_re, $char, $match) || preg_match($punct_re, $nextchar, $match)) {
            $odd_symbol = $match[1];
        }
        $hyphen_mode = ($odd_symbol === '-');

        if ($odd_symbol) {
            // Grow the left part backwards from the current position. Around a
            // hyphen only Cyrillic letters, hyphens and apostrophes belong to
            // the chain; around other symbols anything except whitespace does.
            for ($j = $i; $j >= 0; --$j) {
                $t = mb_substr($txt, $j, 1, 'UTF-8');
                $belongs = $hyphen_mode
                    ? (is_cyr($t) || is_hyphen($t) || $t === "'")
                    : !is_space($t);
                if (!$belongs) {
                    break;
                }
                $chain_left = $t . $chain_left;
                // Drop the odd symbol itself from the end of the left part.
                if (mb_substr($chain_left, -1, 1, 'UTF-8') === $odd_symbol) {
                    $chain_left = mb_substr($chain_left, 0, -1, 'UTF-8');
                }
            }

            // Grow the right part forwards from the next position.
            for ($j = $i + 1; $j < $txt_len; ++$j) {
                $t = mb_substr($txt, $j, 1, 'UTF-8');
                $belongs = $hyphen_mode
                    ? (is_cyr($t) || is_hyphen($t) || $t === "'")
                    : !is_space($t);
                if (!$belongs) {
                    break;
                }
                $chain_right .= $t;
                // Drop the odd symbol itself from the start of the right part.
                if (mb_substr($chain_right, 0, 1, 'UTF-8') === $odd_symbol) {
                    $chain_right = mb_substr($chain_right, 1, null, 'UTF-8');
                }
            }
            $chain = $chain_left . $odd_symbol . $chain_right;
        }

        $is_other_symbol = $odd_symbol && !$hyphen_mode;

        // Feature vector; the order of entries defines the bit positions and
        // must match the keys stored in tokenizer_coeff.
        $vector = array_merge(
            char_class($char),
            char_class($nextchar),
            array(
                is_number($prevchar),
                is_number($nnextchar),
                $hyphen_mode ? is_dict_chain($chain) : 0,
                $hyphen_mode ? is_suffix($chain_right) : 0,
                is_same_pm($char, $nextchar),
                $is_other_symbol ? looks_like_url($chain, $chain_right) : 0,
                $is_other_symbol ? is_exception($chain, $exceptions) : 0,
                $hyphen_mode ? is_prefix($chain_left, $prefixes) : 0,
                $odd_symbol === ':' && $chain_right !== '' ? looks_like_time($chain_left, $chain_right) : 0,
            )
        );
        $vector = implode('', $vector);
        $vector_key = bindec($vector);

        // Unknown vectors default to 0.5, i.e. a boundary.
        $sum = isset($coeff[$vector_key]) ? $coeff[$vector_key] : 0.5;

        $token .= $char;

        if ($sum > 0) {
            $token = trim($token);
            if ($token !== '') {
                $out[] = array($token, $sum, $vector_key . '=' . $vector);
            }
            $token = '';
        }
    }

    return $out;
}