/**
 * Heuristically decides whether a URL plausibly belongs to an organization.
 *
 * Only the "//host/" portion of the URL is inspected when the URL contains
 * one; otherwise the URL is used as given. The check succeeds when either:
 *  - a word of the organization name that is longer than one character and
 *    is neither a common word nor a corporate abbreviation occurs in the
 *    URL (case-insensitive), or
 *  - an acronym of more than two characters built from the words' first
 *    characters occurs in the URL. Three acronyms are tried: all words,
 *    all words except common ones, all words except corporate abbreviations.
 *
 * @param string $url      URL to test
 * @param string $org_name organization name; split into words by LsString::split()
 *
 * @return bool true when the URL looks related to the name, false otherwise
 */
public function checkUrl($url, $org_name)
{
    if (preg_match('/\\/\\/[^\\/]+\\//isu', $url, $match)) {
        $url = $match[0];
    }

    $common = array('and', 'the', 'of', 'in', 'at', '&');
    $abbrevs = array('Corporation', 'Inc', 'Group', 'LLC', 'LLP', 'Corp', 'Co', 'Cos', 'LP', 'PA', 'Dept', 'Department', 'International', 'Administration');
    $both = array_merge($common, $abbrevs);

    $all = '';
    $no_common = '';
    $no_corp = '';

    foreach (LsString::split($org_name) as $part) {
        // An empty part has no first character and can never match the URL.
        if ($part === '') {
            continue;
        }

        $initial = $part[0];
        $all .= $initial;
        if (!LsArray::inArrayNoCase($part, $common)) {
            $no_common .= $initial;
        }
        if (!LsArray::inArrayNoCase($part, $abbrevs)) {
            $no_corp .= $initial;
        }

        // Length is tested first so a too-short needle never reaches stripos().
        if (strlen($part) > 1 && stripos($url, $part) !== false && !LsArray::inArrayNoCase($part, $both)) {
            return true;
        }
    }

    // No distinctive word matched: fall back to the acronyms.
    foreach (array($all, $no_common, $no_corp) as $acronym) {
        if (strlen($acronym) > 2 && stripos($url, $acronym) !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Extracts a person's name from free text, given the person's last name.
 *
 * Every whole-word, case-insensitive occurrence of the last name is examined
 * (whitespace and hyphens inside the last name are treated as
 * interchangeable). For each occurrence the text between the preceding comma
 * (or the start of the string) and the last name is scanned right-to-left
 * for given names/initials, and the text after it is scanned for known name
 * suffixes. The result of the last occurrence that yields a non-empty
 * leading part is returned.
 *
 * @param string $str  text to search
 * @param string $last last name to look for
 *
 * @return array|null array with keys 'nameFull', 'nameStart', 'nameLast' and
 *                    'namePost', or null when nothing usable was found or the
 *                    last name occurs at the very start of the text
 */
static function getNameWithLast($str, $last)
{
    $re_last = LsString::escapeStringForRegex($last);
    //hyphens and spaces interchangeable in last names
    $re_last = preg_replace('/\\\\s+|\\\\\\-/is', '(\\s+|\\-)', $re_last);

    $matches = array();
    preg_match_all('/\\b' . $re_last . '\\b/isu', $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $name = null;
    $length = strlen($str);

    foreach ($matches as $match) {
        $pos_last = $match[0][1];
        if ($pos_last == 0) {
            return null;
        }
        $matchedLast = $match[0][0];

        //work backwards from last name to find comma
        // The captured offsets refer to the original $str, so $str must stay
        // untouched; slicing happens on a local copy. Overwriting $str here
        // made the offsets of later matches invalid.
        $comma = strripos($str, ',', -1 * ($length - $pos_last));
        $segment = substr($str, $comma === false ? 0 : $comma);

        $splat = preg_split('/\\b' . $re_last . '\\b/is', $segment);
        $pre = isset($splat[0]) ? $splat[0] : '';
        $post = isset($splat[1]) ? $splat[1] : '';

        // Walk the words before the last name from right to left.
        $given = array();
        $tokens = array_reverse(preg_split('/[\\s]+/', $pre, -1, PREG_SPLIT_NO_EMPTY));
        foreach ($tokens as $token) {
            $case = LsString::checkCase($token);
            if (!$case) {
                continue;
            }
            if ($case == 'initial') {
                $given[] = $token;
                continue;
            }
            if ($case == 'lower') {
                break;
            }
            // A word ending in a period (optionally followed by non-letters)
            // ends the scan; it is kept only if it is a known name prefix.
            if (preg_match('/\\.(\\P{L})*$/u', $token) == 1) {
                $prefix = LsArray::inArrayNoCase(LsString::stripNonAlpha($token), PersonTable::$nameParsePrefixes);
                if ($prefix) {
                    $given[] = $prefix;
                }
                break;
            }
            $given[] = $token;
        }

        $pre = implode(' ', array_reverse($given));
        if (strlen(trim($pre)) == 0) {
            continue;
        }

        // Collect known suffixes that directly follow the last name.
        $suffixes = array();
        foreach (preg_split('/[\\s]+/', $post, -1, PREG_SPLIT_NO_EMPTY) as $token) {
            $case = LsString::checkCase($token);
            if (!$case) {
                continue;
            }
            if ($case == 'lower') {
                break;
            }
            $suffix = LsArray::inArrayNoCase(LsString::stripNonAlpha($token), PersonTable::$nameParseSuffixes);
            if (!$suffix) {
                break;
            }
            $suffixes[] = $suffix;
        }
        $post = trim(implode(' ', $suffixes));

        $full = $pre . ' ' . $matchedLast;
        if (strlen($post) > 0) {
            $full .= ', ' . $post;
        }

        $name = array('nameFull' => $full, 'nameStart' => $pre, 'nameLast' => $matchedLast, 'namePost' => $post);
    }

    return $name;
}
/**
 * Splits a free-text position description into normalized position titles.
 *
 * The text is cleaned up, split on separators (comma, semicolon, "and",
 * " & "), and each piece has title abbreviations expanded and qualifiers
 * such as "Interim" or "Independent" removed. Pieces found in
 * PositionTable::$businessPositions (case-insensitive) are returned.
 *
 * When the text contains more than one kind of separator it is not split at
 * all and the cleaned text is returned as the single element. When no piece
 * is recognized, the cleaned text is returned as the single element.
 *
 * @param string $str    raw description text
 * @param mixed  $entity currently unused; stripping the entity's name and
 *                       ticker from the text is disabled in this version
 *
 * @return array list of description strings
 */
static function parseDescriptionStr($str, $entity = null)
{
    $descriptions = array();

    //cleanup text to be parsed
    $str = trim($str);
    // NOTE(review): the lookarounds below test for a literal '=' followed by
    // whitespace, so in practice this strips essentially every period.
    // Possibly (?<!\s) / (?!\s) was intended - confirm before changing.
    $str = preg_replace('/(?<!=\\s)\\.(?!=\\s)/', '', $str);
    $str = str_replace('.', ' ', $str);
    $str = preg_replace('/\\s{2,}/', ' ', $str);
    $str = preg_replace('/\\s+,(?=\\s)/', ',', $str);
    $str = preg_replace('/\\)\\s*$/', '', $str);
    if (strtolower($str) == 'see remarks') {
        $str = '';
    }

    //don't parse if there's more than one separator
    $num = 0;
    foreach (array('/\\s&\\s/', '/,/', '/;/', '/\\band\\b/i') as $pattern) {
        if (preg_match($pattern, $str)) {
            $num++;
        }
    }
    if ($num > 1) {
        return array($str);
    }

    // Ordered normalization steps: array(kind, search, replacement).
    // 'preg' = preg_replace, 'str' = str_replace, 'istr' = str_ireplace.
    // The order matters: later rules rely on the output of earlier ones.
    $steps = array(
        // Collapse spaced-out initials ("C E O" -> "CEO"). Groups 1 and 5
        // are re-emitted so the words around the initials stay separated.
        array('preg', '/( |^)(\\w) (\\w) (\\w)( |$)/', '\\1\\2\\3\\4\\5'),
        array('preg', '/(Interim|Acting|Incoming) /i', ''),
        array('preg', '/Sr /i', 'Senior '),
        array('preg', '/Chf /i', 'Chief '),
        array('preg', '/( |^)V( |$)/i', ' Vice '),
        array('preg', '/( |^)VP( |$)/i', ' Vice President '),
        array('preg', '/( |^)VC( |$)/i', ' Vice Chairman '),
        array('preg', '/( |^)Chr( |$)/i', ' Chairman '),
        array('preg', '/( |^)Ofcr( |$)/i', ' Officer '),
        array('preg', '/( |^)Vice P( |$)/i', ' Vice President '),
        array('preg', '/( |^)(Ex|Exec)( |$)/i', ' Executive '),
        array('preg', '/( |^)EVP( |$)/i', ' Executive Vice President '),
        array('preg', '/( |^)(Off|Offic|Offcr)( |$)/i', ' Officer '),
        array('str', 'Gen ', 'General '),
        array('preg', '/( |^)(Op|Oper) /', ' Operating '),
        array('preg', '/( |^)(Bd|Brd)( |$)/i', ' Board '),
        array('preg', '/of Board/i', ' of the Board'),
        array('preg', '/( |^)COB( |$)/i', ' Chairman of the Board '),
        array('preg', '/( |^)(Pres|Prs|Presid|Prsdt|Prsdnt)( |$)/i', ' President '),
        array('preg', '/( |^)Admin( |$)/i', ' Administrative '),
        array('preg', '/( |^)Info( |$)/i', ' Information '),
        array('preg', '/\\bComm\\b/i', 'Committee'),
        array('preg', '/\\bInc\\b/i', ''),
        array('preg', '/( |-|^)(Ch|Chm|Chmn|Chrm|Chrmn|Chair|Chairmain|Chariman)( |$)/i', '\\1Chairman '),
        array('preg', '/(Sec|Secr|Secy|Secretar|Secreta)( |$)/i', 'Secretary '),
        array('str', 'Vice-', 'Vice '),
        array('preg', '/( |^)Non /i', ' Non-'),
        array('preg', '/\\bCompl\\b/i', 'Compliance'),
        array('istr', 'of Advisory', 'of the Advisory'),
        array('preg', '/Advisory (Panel|Council)/i', 'Advisory Board'),
        array('istr', 'Independent ', ''),
        array('istr', 'Lead ', ''),
        array('istr', 'Corporate ', ''),
        array('istr', 'Outside ', ''),
        array('istr', 'Non-interested', ''),
        array('istr', 'Interested', ''),
        array('str', 'Main ', ''),
        array('istr', 'Presiding ', ''),
        array('istr', 'Founding ', ''),
        array('istr', 'Acctg', 'Accounting'),
        array('istr', 'Chairperson', 'Chairman'),
        array('istr', 'Chairwoman', 'Chairman'),
        array('istr', "Gen'l", 'General'),
    );

    //split by commas
    $parts = preg_split('/,|;|\\band\\b|\\s&\\s/', $str, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($parts as $part) {
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        //abbreviation replacements
        foreach ($steps as $step) {
            list($kind, $search, $replace) = $step;
            if ($kind == 'preg') {
                $part = preg_replace($search, $replace, $part);
            } elseif ($kind == 'istr') {
                $part = str_ireplace($search, $replace, $part);
            } else {
                $part = str_replace($search, $replace, $part);
            }
        }

        // Several rules pad their replacement with spaces; tidy up.
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        if (LsArray::inArrayNoCase($part, PositionTable::$businessPositions)) {
            $descriptions[] = $part;
        }
    }

    // Nothing recognized: hand back the cleaned text unchanged.
    if (!count($descriptions)) {
        $descriptions[] = $str;
    }

    return $descriptions;
}
/**
 * Parses a name written in natural order ("Dr. John Q. Public Jr.") into parts.
 *
 * Steps: protect a multi-word surname (if given) by joining its words with
 * underscores; replace periods with spaces; drop parenthesized text; strip
 * known prefixes from the front and known suffixes from the back; pull out a
 * quoted nickname; then assign the remaining words to first/middle/last.
 * All parts except prefix and suffix are trimmed of leading/trailing
 * non-letters and re-cased when they are all upper or all lower case.
 *
 * @param string      $str         the name to parse
 * @param string|null $surname     known surname, used to keep multi-word
 *                                 last names like "Van der Twerp" together
 * @param bool        $returnArray when true, return the parts as an array
 *
 * @return array|Entity array with keys name_first, name_last, name_middle,
 *                      name_prefix, name_suffix, name_nick when $returnArray
 *                      is true; otherwise a new Entity with the Person
 *                      extension and those fields populated
 */
static function parseFlatName($str, $surname = null, $returnArray = false)
{
    $namePrefix = $nameFirst = $nameMiddle = $nameLast = $nameSuffix = $nameNick = null;

    //to handle multi-word last names like Van der Twerp
    $sub = null;
    if ($surname) {
        $sub = preg_replace('/(^(\\P{L})+|(\\P{L})+$)/u', '', $surname);
        $sub = preg_replace('/\\s+/is', '_', $sub);
        $str = str_ireplace($surname, $sub, $str);
    }

    //trim and remove periods
    $str = trim(str_replace('.', ' ', $str));
    //remove extra spaces
    $str = preg_replace('/\\s{2,}/', ' ', $str);
    //remove any parenthesized text that follows a space (not only at the end)
    $str = preg_replace('/ \\([^\\)]+\\)/', '', $str);

    //get prefixes
    // After each successful strip the list is rescanned from the top, so
    // stacked prefixes are removed in whatever order they appear.
    // NOTE(review): prefixes are interpolated into the pattern unescaped;
    // presumably the list holds no regex metacharacters - confirm.
    do {
        $stripped = false;
        foreach (self::$nameParsePrefixes as $prefix) {
            if (!$prefix) {
                break;
            }
            $new = preg_replace('/^' . $prefix . ' /i', '', $str);
            if ($new !== null && $new !== $str) {
                if (!LsArray::inArrayNoCase($prefix, LsLanguage::$commonPrefixes)) {
                    $namePrefix .= $prefix . ' ';
                }
                $str = trim($new);
                $stripped = true;
                break;
            }
        }
    } while ($stripped);
    $namePrefix = $namePrefix ? trim($namePrefix) : null;

    //get suffixes
    do {
        $stripped = false;
        foreach (self::$nameParseSuffixes as $suffix) {
            if (!$suffix) {
                break;
            }
            $new = preg_replace('/ ' . $suffix . '$/i', '', $str);
            if ($new !== null && $new !== $str) {
                // Suffixes are peeled from the right, so prepend to keep order.
                $nameSuffix = $suffix . ' ' . $nameSuffix;
                $str = trim($new);
                $stripped = true;
                break;
            }
        }
    } while ($stripped);
    $nameSuffix = $nameSuffix ? trim($nameSuffix) : null;

    //remove commas left over from suffixes
    $str = trim(str_replace(',', '', $str));

    //find nickname in quotes
    if (preg_match('/["\']([\\S]+)[\'"]/', $str, $nickFound)) {
        // The pattern has a single capture group, which is always non-empty.
        $nameNick = $nickFound[1];
        $str = trim(preg_replace('/["\']([\\S]+)[\'"]/', '', $str));
    }

    //condense multiple spaces
    $str = preg_replace('/\\s{2,}/', ' ', $str);

    //split into parts
    $parts = explode(' ', $str);
    $total = count($parts);

    switch ($total) {
        case 1:
            // A single word: borrow the prefix or suffix as the missing half
            // when one was found, otherwise decide by the known surname.
            if ($namePrefix) {
                $nameFirst = $namePrefix;
                $nameLast = $parts[0];
                $namePrefix = null;
            } elseif ($nameSuffix) {
                $nameFirst = $parts[0];
                $nameLast = $nameSuffix;
                $nameSuffix = null;
            } elseif (strtolower((string) $sub) == strtolower($parts[0])) {
                $nameLast = $parts[0];
            } else {
                $nameFirst = $parts[0];
            }
            break;

        case 2:
            $nameFirst = $parts[0];
            $nameLast = $parts[1];
            break;

        case 3:
            $nameFirst = $parts[0];
            $nameMiddle = $parts[1];
            $nameLast = $parts[2];
            break;

        default:
            $nameFirst = $parts[0];
            $nameLast = $parts[$total - 1];
            $nameMiddle = trim(implode(' ', array_slice($parts, 1, $total - 2)));
            break;
    }

    // Undo the underscore protection applied to a multi-word surname.
    $nameLast = str_replace('_', ' ', $nameLast);

    $name = array(
        'name_first' => $nameFirst,
        'name_last' => $nameLast,
        'name_middle' => $nameMiddle,
        'name_prefix' => $namePrefix,
        'name_suffix' => $nameSuffix,
        'name_nick' => $nameNick,
    );

    foreach (array_keys($name) as $nk) {
        $nv = $name[$nk];
        if (!$nv || $nk == 'name_suffix' || $nk == 'name_prefix') {
            continue;
        }
        $nv = preg_replace('/^(\\P{L})+|(\\P{L})+$/u', '', $nv);
        $case = LsString::checkCase($nv);
        if ($case == 'upper' || $case == 'lower') {
            $nv = LsLanguage::nameize($nv);
        }
        if ($nk != 'name_last') {
            $nv = LsLanguage::hgCaser($nv, false);
        }
        $name[$nk] = $nv;
    }

    if ($returnArray) {
        return $name;
    }

    $person = new Entity();
    $person->addExtension('Person');
    $person->name_first = $name['name_first'];
    $person->name_middle = $name['name_middle'];
    $person->name_last = $name['name_last'];
    $person->name_nick = $name['name_nick'];
    $person->name_prefix = $name['name_prefix'];
    $person->name_suffix = $name['name_suffix'];

    return $person;
}
/**
 * Splits a free-text position description into position records.
 *
 * The corporation's name, nickname and ticker are removed from the text,
 * which is then split on separators (comma, semicolon, "and"/"AND", " & ",
 * "/", and "-" unless preceded by "CO"/"Co"). Each piece has title
 * abbreviations expanded and qualifiers removed, then is resolved as follows:
 *  1. a case-insensitive match in PositionTable::$businessPositions;
 *  2. otherwise a Relationship row whose description1 equals the piece;
 *  3. otherwise, if nothing has been recognized yet, trailing words are
 *     dropped one at a time looking for a match by (1) or (2); if none is
 *     found the whole piece becomes the description;
 *  4. otherwise the piece is appended as a note to the previous record.
 *
 * @param string $str  raw description text
 * @param object $corp object exposing name, name_nick and ticker properties
 *
 * @return array list of arrays with keys 'description' (string) and
 *               'note' (list of strings)
 */
public function parseDescriptionStr($str, $corp)
{
    $descriptions = array();

    //cleanup text to be parsed
    $str = trim($str);
    $str = str_replace('.', ' ', $str);
    $str = preg_replace('/\\s{2,}/', ' ', $str);

    $name_re = LsString::escapeStringForRegex($corp->name);
    $str = preg_replace('/\\b' . $name_re . '\\b/isu', '', $str);
    if ($corp->name_nick) {
        $nick_re = LsString::escapeStringForRegex($corp->name_nick);
        $str = preg_replace('/\\b' . $nick_re . '\\b/isu', '', $str);
    }
    if ($corp->ticker) {
        $tick_re = LsString::escapeStringForRegex($corp->ticker);
        $str = preg_replace('/\\b' . $tick_re . '\\b/isu', '', $str);
    }

    // Ordered normalization steps: array(kind, search, replacement).
    // 'preg' = preg_replace, 'str' = str_replace, 'istr' = str_ireplace.
    // The order matters: later rules rely on the output of earlier ones.
    $steps = array(
        // Collapse spaced-out initials ("C E O" -> "CEO"). Groups 1 and 5
        // are re-emitted so the words around the initials stay separated.
        array('preg', '/( |^)(\\w) (\\w) (\\w)( |$)/', '\\1\\2\\3\\4\\5'),
        array('preg', '/(Interim|Acting|Incoming) /i', ''),
        array('preg', '/Sr /i', 'Senior '),
        array('preg', '/Chf /i', 'Chief '),
        array('preg', '/( |^)V( |$)/i', ' Vice '),
        array('preg', '/( |^)VP( |$)/i', ' Vice President '),
        array('preg', '/( |^)VC( |$)/i', ' Vice Chairman '),
        array('preg', '/( |^)Chr( |$)/i', ' Chairman '),
        array('preg', '/( |^)Ofcr( |$)/i', ' Officer '),
        array('preg', '/( |^)Vice P( |$)/i', ' Vice President '),
        array('preg', '/( |^)(Ex|Exec)( |$)/i', ' Executive '),
        array('preg', '/( |^)EVP( |$)/i', ' Executive Vice President '),
        array('preg', '/( |^)(Off|Offic|Offcr)( |$)/i', ' Officer '),
        array('str', 'Gen ', 'General '),
        array('preg', '/( |^)(Op|Oper) /', ' Operating '),
        array('preg', '/( |^)(Bd|Brd)( |$)/i', ' Board '),
        array('preg', '/of Board/i', ' of the Board'),
        array('preg', '/( |^)COB( |$)/i', ' Chairman of the Board '),
        array('preg', '/( |^)(Pres|Prs|Presid|Prsdt|Prsdnt)( |$)/i', ' President '),
        array('preg', '/( |^)Admin( |$)/i', ' Administrative '),
        array('preg', '/( |^)Info( |$)/i', ' Information '),
        array('preg', '/\\bComm\\b/i', 'Committee'),
        array('preg', '/\\bInc\\b/i', ''),
        array('preg', '/( |-|^)(Ch|Chm|Chmn|Chrm|Chrmn|Chair|Chairmain|Chariman)( |$)/i', '\\1Chairman '),
        array('preg', '/(Sec|Secr|Secy|Secretar|Secreta)( |$)/i', 'Secretary '),
        array('str', 'Vice-', 'Vice '),
        array('preg', '/( |^)Non /i', ' Non-'),
        array('preg', '/\\bCompl\\b/i', 'Compliance'),
        array('istr', 'of Advisory', 'of the Advisory'),
        array('preg', '/Advisory (Panel|Council)/i', 'Advisory Board'),
        array('istr', 'Independent ', ''),
        array('istr', 'Lead ', ''),
        array('istr', 'Corporate ', ''),
        array('istr', 'Outside ', ''),
        array('istr', 'Non-interested', ''),
        array('istr', 'Interested', ''),
        array('str', 'Main ', ''),
        array('istr', 'Presiding ', ''),
        array('istr', 'Founding ', ''),
        array('istr', 'Acctg', 'Accounting'),
        array('istr', 'Chairperson', 'Chairman'),
        array('istr', 'Chairwoman', 'Chairman'),
        array('istr', "Gen'l", 'General'),
    );

    //split by commas
    $parts = preg_split('/,|;|\\band\\b|(?<!C[Oo])\\-|\\bAND\\b|\\s&\\s|\\//', $str, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($parts as $part) {
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        //abbreviation replacements
        foreach ($steps as $step) {
            list($kind, $search, $replace) = $step;
            if ($kind == 'preg') {
                $part = preg_replace($search, $replace, $part);
            } elseif ($kind == 'istr') {
                $part = str_ireplace($search, $replace, $part);
            } else {
                $part = str_replace($search, $replace, $part);
            }
        }

        // Several rules pad their replacement with spaces; tidy up.
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        if ($part == '') {
            continue;
        }

        $position = array('description' => null, 'note' => array());

        //look for matching title
        $p = LsArray::inArrayNoCase($part, PositionTable::$businessPositions);
        if ($p) {
            $position['description'] = $p;
        } elseif ($q = Doctrine::getTable('Relationship')->findOneByDescription1($part)) {
            // Look the title text itself up (not the $position working array).
            $position['description'] = $q->description1;
        } elseif (count($descriptions) == 0) {
            // Nothing recognized so far: drop trailing words one at a time
            // and retry, stopping once only "Director" would remain.
            // NOTE(review): the dropped words are not recorded anywhere, and
            // the loop keeps going after a match, so a shorter match can
            // overwrite a longer one - confirm whether that is intended.
            $words = LsString::split($part);
            $lim = count($words) - 1;
            for ($i = 0; $i < $lim; $i++) {
                array_pop($words);
                $part_new = implode(' ', $words);
                if (strtoupper($part_new) == 'DIRECTOR') {
                    break;
                }
                $p = LsArray::inArrayNoCase($part_new, PositionTable::$businessPositions);
                if ($p) {
                    $position['description'] = $p;
                } elseif ($q = Doctrine::getTable('Relationship')->findOneByDescription1($part_new)) {
                    $position['description'] = $q->description1;
                }
            }
            if (!$position['description']) {
                $position['description'] = $part;
            }
        } else {
            // Unrecognized text after a recognized title becomes a note on it.
            $descriptions[count($descriptions) - 1]['note'][] = $part;
        }

        if (isset($position['description'])) {
            $descriptions[] = $position;
        }
    }

    return $descriptions;
}