/**
 * Split a free-form personal name into salutation, first name, initials,
 * last name and suffix.
 *
 * Words that start with "(" are dropped before parsing. The first word is
 * tested with is_salutation() and the last word with is_suffix(); the words
 * in between are distributed over first name, initials and last name.
 *
 * Examples (taken from the parsing rules below):
 *   "R. Jason Smith" => fname "Jason", initials "R.", lname "Smith"
 *   "R. J. Smith"    => fname "R.",    initials "J.", lname "Smith"
 *
 * @param string $full_name The name to parse.
 *
 * @return array Associative array with the keys 'salutation', 'fname',
 *               'initials', 'lname' and 'suffix' (in that order). 'salutation'
 *               and 'suffix' hold whatever is_salutation() / is_suffix()
 *               returned; when the input contains no usable words they are
 *               false and the three string fields are ''.
 */
function split_full_name($full_name)
{
    $full_name = trim($full_name);

    // Initialise every accumulator up front: the loops below only append,
    // and some branches never touch a given field at all.
    $name_parts = array();
    $fname = '';
    $initials = '';
    $lname = '';

    // split into words, completely ignoring any words in parentheses
    foreach (explode(" ", $full_name) as $word) {
        // Runs of spaces make explode() yield empty strings; skip them so
        // they neither trigger an offset warning nor count as words.
        if ($word === '' || $word[0] === "(") {
            continue;
        }
        $name_parts[] = $word;
    }
    $num_words = count($name_parts);

    // Nothing left to parse (empty input or only parenthesised words).
    // NOTE(review): false is used as the "absent" marker because the code
    // below only tests salutation/suffix for truthiness - confirm this
    // matches what is_salutation()/is_suffix() return for a non-match.
    if ($num_words === 0) {
        return array(
            'salutation' => false,
            'fname' => '',
            'initials' => '',
            'lname' => '',
            'suffix' => false,
        );
    }

    // is the first word a title? (Mr. Mrs, etc) / is the last word a suffix?
    $salutation = is_salutation($name_parts[0]);
    $suffix = is_suffix($name_parts[$num_words - 1]);

    // set the range for the middle part of the name (trim prefixes & suffixes)
    $start = $salutation ? 1 : 0;
    $end = $suffix ? $num_words - 1 : $num_words;

    // concat the first name; the word at $end - 1 is always left for the last name
    for ($i = $start; $i < $end - 1; $i++) {
        $word = $name_parts[$i];

        // move on to parsing the last name if we find an indicator of a compound last name (Von, Van, etc)
        // we use $i != $start to allow for rare cases where an indicator is actually the first name (like "Von Fabella")
        if (is_compound_lname($word) && $i != $start) {
            break;
        }

        if (!is_initial($word)) {
            $fname .= " " . fix_case($word);
            continue;
        }

        // The word is an initial. If it is the very first word, look ahead:
        // "R. Jason Smith" => they go by the middle name, "R." is an initial;
        // "R. J. Smith"    => "R." acts as the first name, "J." is the initial.
        // $i + 1 is always a valid index here because $i < $end - 1.
        if ($i == $start && is_initial($name_parts[$i + 1])) {
            $fname .= " " . strtoupper($word);
        } else {
            $initials .= " " . strtoupper($word);
        }
    }

    if ($end - $start > 1) {
        // everything from where the first-name loop stopped is the last name
        for (; $i < $end; $i++) {
            $lname .= " " . fix_case($name_parts[$i]);
        }
    } elseif (isset($name_parts[$i])) {
        // single word strings are assumed to be first names; the isset()
        // guard covers inputs where the salutation/suffix consumed the only
        // word and $i points past the end of the list
        $fname = fix_case($name_parts[$i]);
    }

    // return the various parts in an array
    return array(
        'salutation' => $salutation,
        'fname' => trim($fname),
        'initials' => trim($initials),
        'lname' => trim($lname),
        'suffix' => $suffix,
    );
}
// Example #2
// 0
/**
 * Split text into tokens using per-position feature vectors and a table of
 * learned coefficients.
 *
 * For every character position a vector of 0/1 features is built from the
 * current and neighbouring characters, encoded as a binary number and looked
 * up in the coefficients read from the `tokenizer_coeff` table (first column
 * = vector value, second column = coefficient). A coefficient greater than 0
 * closes the current token after this character; vectors that are not in the
 * table default to 0.5, i.e. they also close the token.
 *
 * @param string $txt        Text to tokenize; it is normalized to Unicode NFC first.
 * @param mixed  $exceptions Passed through unchanged to is_exception().
 * @param mixed  $prefixes   Passed through unchanged to is_prefix().
 *
 * @return array List of array(token, coefficient, "<decimal>=<binary vector>"),
 *               one entry per non-empty token, in text order.
 */
function tokenize_ml($txt, $exceptions, $prefixes)
{
    $coeff = array();
    $out = array();
    $token = '';
    // NOTE(review): Normalizer::normalize() is documented to return false on
    // failure, in which case the text below degrades to just the padding and
    // the result is empty - confirm that is acceptable for callers.
    $txt = Normalizer::normalize($txt, Normalizer::FORM_C);

    $res = sql_query("SELECT * FROM tokenizer_coeff");
    while ($r = sql_fetch_array($res)) {
        $coeff[$r[0]] = $r[1];
    }

    // Two trailing spaces of padding; presumably so the last real characters
    // still get a whitespace context to decide on - TODO confirm.
    $txt .= '  ';

    // $txt is not modified past this point, so its length can be measured once
    // instead of on every evaluation of the loop conditions.
    $txt_len = mb_strlen($txt, 'UTF-8');

    // Characters that, like a hyphen, may join two chunks into one token.
    $odd_symbol_re = '/([\\.\\/\\?\\=\\:&"!\\+\\(\\)])/u';

    for ($i = 0; $i < $txt_len; ++$i) {
        $prevchar = $i > 0 ? mb_substr($txt, $i - 1, 1, 'UTF-8') : '';
        $char = mb_substr($txt, $i + 0, 1, 'UTF-8');
        $nextchar = mb_substr($txt, $i + 1, 1, 'UTF-8');
        $nnextchar = mb_substr($txt, $i + 2, 1, 'UTF-8');

        //$chain is the current word which we will perhaps need to check in the dictionary
        $chain = $chain_left = $chain_right = '';
        $odd_symbol = '';
        if (is_hyphen($char) || is_hyphen($nextchar)) {
            $odd_symbol = '-';
        } elseif (preg_match($odd_symbol_re, $char, $match) || preg_match($odd_symbol_re, $nextchar, $match)) {
            $odd_symbol = $match[1];
        }

        if ($odd_symbol) {
            // Grow the chain to the left, starting at the current character.
            // Hyphen chains accept Cyrillic letters, hyphens and apostrophes;
            // chains around any other symbol accept everything but whitespace.
            for ($j = $i; $j >= 0; --$j) {
                $t = mb_substr($txt, $j, 1, 'UTF-8');
                if (($odd_symbol == '-' && (is_cyr($t) || is_hyphen($t) || $t === "'")) || ($odd_symbol != '-' && !is_space($t))) {
                    $chain_left = $t . $chain_left;
                } else {
                    break;
                }
                // The symbol itself is re-inserted when the chain is glued
                // together, so drop it from the end of the left part.
                // Encoding is passed explicitly, like everywhere else here,
                // so the result does not depend on mb_internal_encoding().
                if (mb_substr($chain_left, -1, 1, 'UTF-8') === $odd_symbol) {
                    $chain_left = mb_substr($chain_left, 0, -1, 'UTF-8');
                }
            }

            // Grow the chain to the right, starting after the current character.
            for ($j = $i + 1; $j < $txt_len; ++$j) {
                $t = mb_substr($txt, $j, 1, 'UTF-8');
                if (($odd_symbol == '-' && (is_cyr($t) || is_hyphen($t) || $t === "'")) || ($odd_symbol != '-' && !is_space($t))) {
                    $chain_right .= $t;
                } else {
                    break;
                }
                // Same as above: strip the symbol from the start of the right part.
                if (mb_substr($chain_right, 0, 1, 'UTF-8') === $odd_symbol) {
                    $chain_right = mb_substr($chain_right, 1, null, 'UTF-8');
                }
            }
            $chain = $chain_left . $odd_symbol . $chain_right;
        }

        // Feature vector: character classes of the current and next character
        // followed by the context features. The order matters because the
        // vector is read as a binary number below.
        $vector = array_merge(
            char_class($char),
            char_class($nextchar),
            array(
                is_number($prevchar),
                is_number($nnextchar),
                $odd_symbol == '-' ? is_dict_chain($chain) : 0,
                $odd_symbol == '-' ? is_suffix($chain_right) : 0,
                is_same_pm($char, $nextchar),
                $odd_symbol && $odd_symbol != '-' ? looks_like_url($chain, $chain_right) : 0,
                $odd_symbol && $odd_symbol != '-' ? is_exception($chain, $exceptions) : 0,
                $odd_symbol == '-' ? is_prefix($chain_left, $prefixes) : 0,
                $odd_symbol == ':' && $chain_right !== '' ? looks_like_time($chain_left, $chain_right) : 0,
            )
        );
        $vector = implode('', $vector);
        $vector_key = bindec($vector);

        // Unknown vectors get 0.5, which is > 0 and therefore a token boundary.
        $sum = isset($coeff[$vector_key]) ? $coeff[$vector_key] : 0.5;

        $token .= $char;
        if ($sum > 0) {
            $token = trim($token);
            if ($token !== '') {
                $out[] = array($token, $sum, $vector_key . '=' . $vector);
            }
            $token = '';
        }
    }
    return $out;
}