/**
 * Checks whether any of this entity's names resembles the given string.
 *
 * The search string has its suffixes removed (OrgTable::removeSuffixes) and is
 * split into terms by LsQuery::splitSearchPhrase(). A candidate name matches
 * when it starts with the first term (optionally preceded by "The") and every
 * term can be found in it as a separate word.
 *
 * @param string   $str    Name to compare against the entity's names.
 * @param bool|int $strict When truthy, a candidate is rejected if, after the
 *                         matched terms and suffixes are removed, the number of
 *                         leftover words (per LsString::split) is >= $strict.
 *                         Passing true therefore rejects any leftover word.
 *
 * @return bool True as soon as one name matches, false otherwise.
 */
public function hasSimilarName($str, $strict = false)
{
    $str = trim(OrgTable::removeSuffixes($str));
    if (!strlen($str)) {
        return false;
    }

    // Convert every search term into a regex fragment. A term supplied as an
    // array of alternatives becomes an alternation of its escaped members.
    $patterns = array();
    foreach (LsQuery::splitSearchPhrase($str) as $term) {
        if (is_array($term)) {
            $patterns[] = implode('|', array_map(array('LsString', 'escapeStringForRegex'), $term));
        } else {
            $patterns[] = LsString::escapeStringForRegex($term);
        }
    }

    // A leading "The" is optional in the anchor regex below, so drop it here.
    if (isset($patterns[0]) && $patterns[0] === 'The') {
        array_shift($patterns);
    }

    // Nothing left to search for (e.g. the phrase was just "The"). Without this
    // guard the anchor regex would contain an empty group and match every name.
    if (!$patterns) {
        return false;
    }

    foreach ($this->Entity->getAllNames() as $name) {
        // The name must begin with the first term.
        if (!preg_match('/^(The\\s+)?(' . $patterns[0] . ')/isu', $name)) {
            continue;
        }

        // Remove each term once; a term that cannot be removed is missing.
        $remainder = (string) $name;
        $allFound = true;
        foreach ($patterns as $pattern) {
            $stripped = preg_replace('/((^|\\s)|\\b)(' . $pattern . ')(\\b|(\\s|$))/isu', ' ', $remainder, 1);
            // preg_replace() returns null on failure; treat that as "not found"
            // instead of letting the null slip through as a successful match.
            if ($stripped === null || $stripped === $remainder) {
                $allFound = false;
                break;
            }
            $remainder = $stripped;
        }
        if (!$allFound) {
            continue;
        }

        // In strict mode too many unexplained leftover words disqualify the name.
        $remainder = trim(OrgTable::removeSuffixes($remainder));
        if ($strict && strlen($remainder) > 0 && count(LsString::split($remainder)) >= $strict) {
            continue;
        }

        return true;
    }

    return false;
}
/**
 * Normalizes an organization name for category matching.
 *
 * The name has its suffixes removed by OrgTable::removeSuffixes(), is
 * lowercased with strtolower(), and finally loses every apostrophe.
 *
 * @param string $name Raw organization name.
 *
 * @return string The normalized name.
 */
static function cleanNameForCategoryMatching($name)
{
    $withoutSuffixes = OrgTable::removeSuffixes($name);
    $lowercased = strtolower($withoutSuffixes);

    return str_replace("'", "", $lowercased);
}
/**
 * Builds a query for entities with the given extensions whose name, or one of
 * whose aliases, is LIKE the search string or one of its spelling variants.
 *
 * Variants are produced by repeatedly applying a fixed list of rewrites
 * (periods, "&" vs "and", commas, "US" vs "United States") to every query
 * generated so far, until no new string appears.
 *
 * @param string|array $extensions Extension name(s); 'Org' additionally strips
 *                                 organization suffixes (except 'Bancorp')
 *                                 from the search string.
 * @param string       $str        Name to search for.
 * @param int          $strength   0 = match anywhere ('%term%'),
 *                                 1 = prefix match ('term%'),
 *                                 anything else = LIKE without wildcards.
 *
 * @return mixed The query returned by EntityTable::getByExtensionQuery() with
 *               the alias join and name conditions added, or an empty array
 *               when the search string is shorter than 3 bytes.
 */
public static function getByExtensionAndNameQuery($extensions, $str, $strength = 1)
{
    $extensions = (array) $extensions;
    $str = trim($str);

    if (in_array('Org', $extensions)) {
        $str = OrgTable::removeSuffixes($str, array('Bancorp'));
    }

    if (strlen($str) < 3) {
        return array();
    }

    $q = EntityTable::getByExtensionQuery($extensions)->leftJoin('e.Alias a');

    // Ordered (pattern, replacement) pairs. This must be a list rather than a
    // pattern => replacement map: the period pattern appears twice, and as
    // duplicate array keys the second entry would silently replace the first.
    $rewrites = array(
        array('\\.', ' '),
        array('\\.', ''),
        array('\\s&\\s', ' and '),
        array('\\sand\\s', ' & '),
        array(' & ', ' '),
        array(',', ''),
        array('\\bUS\\b', 'United States'),
        array('United States\\b', 'US'),
    );

    $search_queries = array($str);

    // Worklist expansion: every newly discovered variant is itself rewritten.
    $i = 0;
    while ($i < count($search_queries)) {
        $name = $search_queries[$i];
        $i++;

        // NOTE(review): a too-short variant stops expansion of all remaining
        // queries (break, not continue); kept as in the original, confirm intent.
        if (strlen($name) < 3) {
            break;
        }

        foreach ($rewrites as $rewrite) {
            $new = preg_replace('/' . $rewrite[0] . '/isu', $rewrite[1], $name);

            // preg_replace() returns null on failure (e.g. invalid UTF-8 with
            // the /u modifier). Skipping it keeps a null from becoming a bare
            // '%' wildcard that would match every entity.
            if ($new === null || $new === $name) {
                continue;
            }

            if (!in_array($new, $search_queries, true)) {
                $search_queries[] = $new;
            }
        }
    }

    // Apply LIKE wildcards according to the requested strength.
    foreach ($search_queries as &$s) {
        if ($strength == 0) {
            $s = '%' . $s . '%';
        } elseif ($strength == 1) {
            $s .= '%';
        }
    }
    unset($s);

    $e = implode(' or ', array_fill(0, count($search_queries), 'e.name like ?'));
    $a = implode(' or ', array_fill(0, count($search_queries), 'a.name like ?'));

    // Each term is bound twice: once for the entity name, once for the alias.
    $params = array_merge($search_queries, $search_queries);
    $q->addWhere($e . ' or ' . $a, $params);

    return $q;
}
/**
 * Builds a tolerant regex fragment for matching an organization name in text.
 *
 * The name is stripped of suffixes and escaped for regex use, then loosened:
 * commas become optional, every "\s+" token in the escaped string may be
 * preceded by an optional comma, and "&" may also be spelled "and".
 * The fragment is returned without delimiters or modifiers.
 *
 * @param string $name Organization name.
 *
 * @return string Regex fragment.
 */
static function getNameRegex($name)
{
    $pattern = LsString::escapeStringForRegex(OrgTable::removeSuffixes($name));

    // With array arguments str_replace() applies the pairs one after another,
    // in this order, exactly like a chain of separate calls.
    $search = array(',', '\\s+', '&');
    $replace = array(',?', ',?\\s+', '(&|\\s*and\\s*)');

    return str_replace($search, $replace, $pattern);
}
/**
 * Imports one row: finds or creates the person, and where needed the
 * affiliated organization and a Position relationship between them.
 *
 * Matching strategy for each similarly named existing person:
 *  1. first names must be prefix-compatible in either direction;
 *  2. the person matches if checkAffiliations() recognises one of the row's
 *     affiliationN values (no new relationship is created in that case);
 *  3. otherwise the person matches if one of the unrecognised affiliations
 *     appears in the person's extended bio.
 * If nobody matches, the parsed person is saved as a new entity.
 *
 * Everything runs inside one database transaction. A summary of what was
 * found or created is appended to $this->edits after a successful commit.
 *
 * @param array $row Row values keyed by column name. 'name' and 'affiliation1'
 *                   are read unconditionally; 'affiliationN', 'bio',
 *                   'affiliation1_extensions', 'affiliation1_title',
 *                   'start_date', 'end_date', 'title', 'notes' and 'rank'
 *                   are optional.
 *
 * @return void
 *
 * @throws Exception Any exception raised during processing is rethrown after
 *                   the transaction has been rolled back.
 */
protected function processRow($row)
{
    foreach ($row as &$r) {
        $r = trim($r);
    }
    // Break the reference so $r cannot alias the last row element later on.
    unset($r);

    $edit = array(
        'Search Name' => $row['name'],
        'Affiliation Name' => $row['affiliation1'],
        'Similar Names' => array(),
        'New Person' => null,
        'Existing Person' => null,
        'New Org' => null,
        'Existing Org' => null,
        'New Relationship' => null,
    );

    try {
        $this->db->beginTransaction();

        $person = null;
        $search_person = PersonTable::parseFlatName($row['name']);
        $similar = $search_person->getSimilarEntitiesQuery(true)->execute();
        $matched_bio = false;
        $similar_ids = array();

        // The searched first name does not change while scanning candidates.
        $search_re = LsString::escapeStringForRegex($search_person->name_first);

        foreach ($similar as $s) {
            $similar_ids[] = $s->id;

            // Skip candidates whose first name is not a prefix of the searched
            // first name, nor the other way round.
            $sim_re = LsString::escapeStringForRegex($s->name_first);
            if (preg_match('/^' . $sim_re . '/su', $search_person->name_first) == 0
                && preg_match('/^' . $search_re . '/su', $s->name_first) == 0
            ) {
                continue;
            }

            // Walk affiliation1, affiliation2, ... until the first one the
            // candidate is known to be affiliated with.
            $ct = 1;
            $matched_affils = array();
            $unmatched_affils = array();
            while (isset($row['affiliation' . $ct]) && trim($row['affiliation' . $ct]) != '') {
                $affil = trim($row['affiliation' . $ct]);
                $org = $s->checkAffiliations(array($affil));
                if ($org) {
                    $matched_affils[] = array($org, $affil);
                    $edit['Existing Org'] = $org->id;
                    break;
                }
                $unmatched_affils[] = $affil;
                $ct++;
            }

            if (count($matched_affils)) {
                $person = $s;
                break;
            }

            // No known affiliation: look for the affiliation names in the bio.
            $bio = $s->getExtendedBio();
            foreach ($unmatched_affils as $affil) {
                $affil = OrgTable::removeSuffixes($affil);
                $this->printDebug($affil);
                $this->printDebug($bio);
                if (preg_match('/' . OrgTable::getNameRegex($affil) . '/su', $bio)) {
                    $matched_bio = true;
                    break;
                }
            }

            if ($matched_bio) {
                $person = $s;
                break;
            }

            $this->printDebug(' ' . $s->name . ' failed');
        }

        $edit['Similar Names'] = array_slice($similar_ids, 0, 5);

        $no_match = false;
        if (!$person) {
            // Nobody matched: store the parsed person as a new entity.
            if (isset($row['bio']) && trim($row['bio']) != '') {
                $search_person->summary = $row['bio'];
            }
            $search_person->save();
            $this->printDebug(' not found, new person saved: ' . $search_person->name);
            $search_person->addReference($this->source_url, null, null, $this->source_name);
            $no_match = true;
            $edit['New Person'] = $search_person->id;
            $person = $search_person;
        } else {
            // Only fill in the summary when the existing person has none.
            if (isset($row['bio']) && trim($row['bio']) != '' && !$person->summary) {
                $person->summary = $row['bio'];
                $person->save();
            }
            $this->printDebug(' **person found: ' . $person->name);
            $edit['Existing Person'] = $person->id;
        }

        // A relationship is only created for new people and bio matches; a
        // person matched through an existing affiliation gets none here.
        if ($matched_bio || $no_match) {
            $orgs = OrgTable::getOrgsWithSimilarNames($row['affiliation1'], true);

            // Among similarly named orgs prefer the one with the highest count
            // from the related-Person position query.
            $max = -1;
            $affiliated_org = null;
            foreach ($orgs as $org) {
                $this->printDebug(' found match: ' . $org->name);
                $ct = $org->getRelatedEntitiesQuery('Person', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 2)->count();
                if ($ct > $max) {
                    $affiliated_org = $org;
                    $edit['Existing Org'] = $affiliated_org->id;
                    $max = $ct;
                }
            }

            if (!$affiliated_org) {
                $affiliated_org = new Entity();
                $affiliated_org->addExtension('Org');

                // Optional comma-separated extension names; unknown ones are ignored.
                if (isset($row['affiliation1_extensions']) && $row['affiliation1_extensions'] != '') {
                    $extensions = explode(',', $row['affiliation1_extensions']);
                    foreach ($extensions as $ext) {
                        $ext = trim($ext);
                        if (in_array($ext, ExtensionDefinitionTable::$extensionNames)) {
                            $affiliated_org->addExtension($ext);
                        }
                    }
                }

                $affiliated_org->name = $row['affiliation1'];
                $affiliated_org->save();
                $affiliated_org->addReference($this->source_url, null, null, $this->source_name);
                $edit['New Org'] = $affiliated_org->id;
            }

            $rel = new Relationship();
            $rel->Entity1 = $person;
            $rel->Entity2 = $affiliated_org;
            $rel->setCategory('Position');

            if (isset($row['affiliation1_title']) && $row['affiliation1_title'] != '') {
                $description = trim($row['affiliation1_title']);
                $rel->description1 = $description;

                // Directors, trustees and chairs are board members, not employees.
                if ($description == 'Director' || $description == 'Trustee' || preg_match('/^Chair/su', $description)) {
                    $rel->is_board = 1;
                    $rel->is_employee = 0;
                }
            }

            $rel->save();
            $rel->addReference($this->source_url, null, null, $this->source_name);
            $edit['New Relationship'] = $rel->id;
        }

        // Optional relationship details are only recorded in the edit summary.
        foreach (array('start_date', 'end_date', 'title', 'notes') as $field) {
            if (isset($row[$field]) && trim($row[$field]) != '') {
                $edit['Relationship'][$field] = trim($row[$field]);
            }
        }

        if (isset($row['rank']) && $row['rank'] != '') {
            $edit['rank'] = $row['rank'];
        }

        $this->db->commit();
    } catch (Exception $e) {
        $this->db->rollback();
        throw $e;
    }

    $this->edits[] = $edit;
}
/**
 * Finds LDA clients whose names match the organization and imports their
 * filings that have not been imported yet.
 *
 * The organization name (suffixes removed, 'Bancorp' kept) is split into terms
 * on whitespace, periods and hyphens. A client matches when
 *  - its name contains every term (SQL LIKE),
 *  - its name starts with the first term, or the first term directly follows
 *    "(for " / "on behalf of ",
 *  - every term occurs in it as a separate word, and
 *  - it is not excluded by the hard-coded "pilots"/"ass" filter.
 * Filings are imported only when the name has more than one term or fewer than
 * 30 distinct client names matched.
 *
 * @param object $org Organization entity; its id and name are read.
 *
 * @return void
 */
private function findOrgInfo($org)
{
    $this->printDebug('***');
    $this->printDebug($org->name);

    if ($this->hasMeta($org->id, 'is_complete') && $this->getMeta($org->id, 'is_complete') && !$this->_override) {
        $this->printDebug("Already fetched lobbying data for Entity " . $org->id . "; skipping...");
        return;
    }

    $name = OrgTable::removeSuffixes($org->name, array('Bancorp'));
    $this->printDebug($name);

    $terms = preg_split('/[\\s\\.\\-]+/isu', $name, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() yields false on failure and an empty list for a name made
    // only of separators. Without terms the query below would be unfiltered
    // and the prefix check would accept almost every client.
    if (!$terms) {
        $this->printDebug('No searchable terms in name; skipping...');
        return;
    }

    $q = LsDoctrineQuery::create()->from('LdaClient c');
    foreach ($terms as $term) {
        $q->addWhere('name like ?', '%' . $term . '%');
    }
    $clients = $q->execute();

    // Escaping does not depend on the client, so do it once up front.
    $escaped_terms = array();
    foreach ($terms as $term) {
        $escaped_terms[] = LsString::escapeStringForRegex($term);
    }
    $start = $escaped_terms[0];

    $client_names = array();
    $client_ids = array();

    foreach ($clients as $client) {
        // The client name must lead with the first term, either directly or
        // after "(for " / "on behalf of ".
        if (preg_match('/^' . $start . '\\b/isu', $client->name) == 0
            && preg_match('/(\\(for\\s+|on\\s+behalf\\s+of\\s+)' . $start . '\\b/isu', $client->name) == 0
        ) {
            continue;
        }

        $remainder = (string) $client->name;

        // Hard-coded exclusion of names containing both "pilots" and "ass".
        if (stripos($remainder, 'pilots') !== false && stripos($remainder, 'ass') !== false) {
            continue;
        }

        // Every term must be removable once as a separate word.
        $matched = true;
        foreach ($escaped_terms as $escaped) {
            $stripped = preg_replace('/((^|\\s)|\\b)' . $escaped . '(\\b|(\\s|$))/isu', ' ', $remainder, 1);
            // A null result means the regex failed; that is not a match.
            if ($stripped === null || $stripped === $remainder) {
                $matched = false;
                break;
            }
            $remainder = $stripped;
        }

        if ($matched) {
            $client_ids[] = $client->id;
            $client_names[] = $client->name;
        }
    }

    $client_names = array_unique($client_names);

    // A single-term name that hits 30 or more distinct clients is considered
    // too ambiguous to import.
    if (count($terms) > 1 || count($client_names) < 30) {
        foreach ($client_ids as $client_id) {
            $lda_filings = Doctrine::getTable('LdaFiling')->findByClientId($client_id);
            foreach ($lda_filings as $lda_filing) {
                $lf = Doctrine::getTable('LobbyFiling')->findOneByFederalFilingId($lda_filing->federal_filing_id);
                if (!$lf) {
                    $this->printDebug($lda_filing->id);
                    $this->printDebug(number_format(memory_get_usage()));
                    $this->importFiling($org, $lda_filing);
                } else {
                    $this->printDebug('Previously imported: ' . $lda_filing->federal_filing_id . "\n");
                }
            }
        }
    }

    $this->printDebug(count($client_names));
    $this->printDebug(implode(', ', $client_names));
}