/**
 * Tells whether any of the entity's names resembles the given name.
 *
 * The input is stripped of org suffixes and split into search terms. A
 * stored name matches when it starts with the first term (optionally
 * preceded by "The") and every term can be found in it.
 *
 * @param string   $str    Name to compare against the entity's names.
 * @param int|bool $strict When truthy, a match is rejected if the words left
 *                         over after removing the terms and suffixes number
 *                         at least this value.
 *
 * @return bool True as soon as one stored name matches, false otherwise.
 */
public function hasSimilarName($str, $strict = false)
{
    $str = trim(OrgTable::removeSuffixes($str));

    if (!strlen($str)) {
        return false;
    }

    $terms = LsQuery::splitSearchPhrase($str);

    // Turn each term into a regex fragment; an array of alternatives becomes "a|b|c".
    foreach ($terms as &$term) {
        if (is_array($term)) {
            foreach ($term as &$t) {
                $t = LsString::escapeStringForRegex($t);
            }
            unset($t); // do not leave a reference into the alternatives array behind

            $term = implode('|', $term);
        } else {
            $term = LsString::escapeStringForRegex($term);
        }
    }
    unset($term);

    // A leading article is handled by the prefix regex below, so drop it from the terms.
    if (isset($terms[0]) && $terms[0] == 'The') {
        array_shift($terms);
    }

    // Without any term the prefix regex would degenerate to "()" and match every
    // name; nothing to compare means nothing is similar.
    if (count($terms) == 0) {
        return false;
    }

    $matched = false;

    foreach ($this->Entity->getAllNames() as $name) {
        // The stored name has to begin with the first term.
        if (!preg_match('/^(The\\s+)?(' . $terms[0] . ')/isu', $name)) {
            $matched = false;
            continue;
        }

        $matched = true;

        // Remove one occurrence of every term; a term that cannot be removed is missing.
        foreach ($terms as $term) {
            $new = preg_replace('/((^|\\s)|\\b)(' . $term . ')(\\b|(\\s|$))/isu', ' ', $name, 1);

            if ($new == $name) {
                $matched = false;
                break; // the outcome for this name is already decided
            }

            $name = $new;
        }

        if (!$matched) {
            continue;
        }

        // In strict mode too many unexplained leftover words disqualify the name.
        $leftover = trim(OrgTable::removeSuffixes($name));

        if ($strict && strlen($leftover) > 0 && count(LsString::split($leftover)) >= $strict) {
            $matched = false;
            continue;
        }

        break;
    }

    return $matched;
}
/**
 * Tries to extract a biography paragraph about a person from an HTML page.
 *
 * Text runs that mention the person's last name (or first name, when the
 * page title contains both names) are collected and passed through a series
 * of heuristics: length, name presence, news-like wording, long quotations,
 * amount of lowercase words, trailing period and number of common bio words.
 *
 * @param string $page   Raw HTML of the page.
 * @param object $person Person record; name_first, name_last and
 *                       getExtensions() are read.
 * @param object $org    Org record; only its name is read.
 *
 * @return string|false The accepted bio text (paragraphs joined by blank
 *                      lines), or false when nothing qualifies.
 */
private function findPersonBio($page, $person, $org)
{
    $name_re = LsString::escapeStringForRegex($person->name_last);

    // When the page title names the person in full, the first name alone may also anchor a bio.
    if (preg_match('/<title>([^<]*)<\\/title>/is', $page, $title)) {
        if (stristr($title[1], $person->name_last) && stristr($title[1], $person->name_first) && strlen($person->name_first) > 2) {
            $name_re .= '|' . LsString::escapeStringForRegex($person->name_first);
        }
    }

    $layout_tags = implode('|', LsHtml::$layoutTags);
    $re2 = '/>([^<]*?(' . $name_re . ')(\\s|,|<)(.*?))<\\/?(' . $layout_tags . ')/is';
    $re = $re2 . 'u';

    // Try the UTF-8 aware pattern first and fall back to the byte-wise one.
    if (!preg_match_all($re, $page, $matches) && !preg_match_all($re2, $page, $matches)) {
        $this->printDebug('no matches found');

        return false;
    }

    $arr = array();
    $qual = false;
    // NOTE: $news deliberately carries over between candidates, as in the original flow.
    $news = false;

    foreach ($matches[1] as $raw) {
        // Curly brackets or "var" indicate script/style content, not prose.
        if (stristr($raw, '}') || stristr($raw, '{') || preg_match('/\\svar\\s/is', $raw)) {
            continue;
        }

        $str = LsHtml::replaceEntities($raw);
        $str = LsHtml::stripTags($str, '');
        $str = trim(LsString::spacesToSpace($str));
        $this->printDebug(strlen($str));

        if (strlen($str) > 3000) {
            $this->printDebug('FAILED - str too long');
            continue;
        }

        if (preg_match('/(^|\\b)(' . $name_re . ')\\b/is', $str) == 0) {
            $this->printDebug($raw . 'FAILED - no name match');
            continue;
        }

        if (count(explode(' ', $str)) < 12) {
            $this->printDebug('FAILED - str not long enough');
            continue;
        }

        // Press-release wording or a dateline-style dash marks the text as news.
        if (stristr($str, 'announce') || stristr($str, 'today') || stristr($str, '—') || stristr($str, '–') || preg_match('/^[^\\-]{0,100}\\-(\\-|\\s)/is', $str)) {
            $news = true;
            $this->printDebug('FAILED: dash / announced / today');
            continue;
        }

        // A quotation longer than six words also marks the text as news. The closing
        // delimiter must be the same quote character that opened the passage (group 2);
        // the previous pattern back-referenced group 1, the leading whitespace, so it
        // fired on almost any text containing a quote character.
        if (preg_match('/(^|\\s)(\'|"|”)((?:(?!\\2).)+)\\2/is', $str, $qm) && count(explode(' ', $qm[0])) > 6) {
            $news = true;
            $this->printDebug('FAILED: quote');
            continue;
        }

        if (preg_match_all('/\\s(\\p{Ll})+\\b/su', $str, $lcm) < 5) {
            $this->printDebug('FAILED: not enough lowercase');
            continue;
        }

        $bio_words = PersonTable::$commonBioWords;
        if (in_array('Lobbyist', $person->getExtensions())) {
            $bio_words = array_merge($bio_words, LobbyistTable::$commonBioWords);
        }
        $bio_words = implode('|', $bio_words);

        // Separate capture variable so the array being iterated is never overwritten.
        $bio_word_ct = preg_match_all('/\\s(' . $bio_words . ')\\s/is', $str, $bio_word_matches);
        $str = trim($str);

        if (preg_match('/\\.$/is', $str) == 0) {
            $this->printDebug('no period at end of string');
            continue;
        }

        if ($bio_word_ct > 1) {
            // A strong candidate qualifies the page and clears the news flag.
            $news = false;
            $qual = true;
            $arr[] = $str;
            continue;
        }

        $this->printDebug('less than 2 bio words');
        if ($news == false) {
            // Weak candidates are kept as supporting text, minus leading punctuation.
            $str = preg_replace('/^[\\,\\.\\:\\;]\\s*/su', '', $str);
            $arr[] = $str;
        }
    }

    if (!$qual) {
        return false;
    }

    $arr = array_unique($arr);
    $bio = implode("\n\n", $arr);
    $ret = false;

    if (strlen($bio) < 3000 && LsString::withinN($bio, '(' . $name_re . ')', '(is|was|holds|led|has|had|provides|practices|served|leads)', 2)) {
        // The name has to show up within the first twenty words of the bio.
        if (preg_match('/^.*?\\b(' . $name_re . ')\\b/is', $bio, $m) && count(explode(' ', $m[0])) < 20) {
            $ret = true;
            $this->printDebug('SUCCESS');
        }
    } else {
        $this->printDebug('within N failed !!!!');
    }

    if (!$ret) {
        return false;
    }

    // When the org is named after the person, the last name alone proves nothing:
    // require the first name or a title near the start, or a personal pronoun.
    $org_test = true;

    if (stristr($org->name, $person->name_last)) {
        $org_test = false;

        if (strlen($person->name_first) > 1) {
            if (preg_match('/([^\\s]+\\s+){0,14}/is', $arr[0], $beg_match)) {
                $nf_re = LsString::escapeStringForRegex($person->name_first);
                if (preg_match('/\\b' . $nf_re . '\\b/is', $beg_match[0]) || preg_match('/\\b(Mr|Mrs|Ms)\\b/su', $arr[0])) {
                    $org_test = true;
                }
            }
        } else {
            if (preg_match('/\\b(he|she|him|her|his|mr|ms|mrs)\\b/is', $arr[0])) {
                $org_test = true;
            }
        }
    }

    return $org_test ? $bio : false;
}
/**
 * Imports a single governor from a profile page.
 *
 * Fetches the page at _baseUrl + $row['url'], reuses an existing similar
 * person whose bio mentions "governor" (or creates a new one), and fills in
 * birth data, party, summary, schools, the governor's office and terms,
 * spouse, phone and image from the page markup. Does nothing when the page
 * request fails.
 *
 * @param array $row Reads the keys 'url', 'name', 'state' and 'years'.
 *
 * @return void
 */
protected function importGovernor($row)
{
    $url = $this->_baseUrl . $row['url'];

    if ($this->browser->get($url)->responseIsError()) {
        return;
    }

    $sourceName = 'Governors Association';
    $text = LsHtml::replaceEntities($this->browser->getResponseText());

    $name = trim(str_ireplace('Gov.', '', $row['name']));
    $this->printDebug('');
    $this->printDebug($name . ':');

    $governor = PersonTable::parseFlatName($name);
    $governor->addExtension('PoliticalCandidate');
    $governor->addExtension('ElectedRepresentative');
    $governor->is_state = 1;

    // Prefer an existing person with a compatible first name whose bio mentions a governorship.
    foreach ($governor->getSimilarEntitiesQuery(true)->execute() as $candidate) {
        $candidateRe = LsString::escapeStringForRegex($candidate->name_first);
        $governorRe = LsString::escapeStringForRegex($governor->name_first);

        $firstNamesCompatible = preg_match('/^' . $candidateRe . '/su', $governor->name_first) != 0
            || preg_match('/^' . $governorRe . '/su', $candidate->name_first) != 0;

        if (!$firstNamesCompatible) {
            continue;
        }

        if (preg_match('/\\bgovernor(ship)?\\b/isu', $candidate->getExtendedBio())) {
            $governor = $candidate;
            $this->printDebug(' Found existing governor: ' . $candidate->name . ' ' . $candidate->id);
            break;
        }
    }

    $governor->save();
    $this->printDebug($governor->id);

    // BIRTH DATA
    if (!$governor->start_date && preg_match('/>Born\\:<\\/b>([^<]*)<br/is', $text, $bornMatch)) {
        $this->printDebug(' Birthdate: ' . $bornMatch[1]);
        $governor->start_date = trim($bornMatch[1]);
    }

    if (!$governor->birthplace && preg_match('/>Birth State\\:<\\/b>([^<]*)<br/is', $text, $birthStateMatch)) {
        $birthplace = trim($birthStateMatch[1]);
        $this->printDebug(' Birthplace: ' . $birthplace);
        $governor->birthplace = $birthplace;
    }

    // PARTY MEMBERSHIP
    if (preg_match('/>Party\\:<\\/b>([^<]*)<br/is', $text, $partyMatch)) {
        $partyText = $partyMatch[1];
        $this->printDebug(' Party: ' . $partyText);

        // Later entries win when the text mentions both parties.
        $party = null;
        $knownParties = array('Democrat' => 'Democratic Party', 'Republican' => 'Republican Party');
        foreach ($knownParties as $needle => $partyName) {
            if (stristr($partyText, $needle)) {
                $party = EntityTable::getByExtensionQuery('PoliticalParty')->addWhere('name = ?', $partyName)->fetchOne();
            }
        }

        if ($party && !$governor->party_id) {
            $governor->Party = $party;
            $governor->is_independent = false;
            $this->printDebug(' Added membership in ' . $party);
        } elseif (stristr($partyText, 'Independent')) {
            $governor->is_independent = true;
        }
    }

    // SUMMARY: long text runs from the page, skipping the Javascript notice
    if (!$governor->summary && preg_match_all('/>([^<]{240,})/isu', $text, $longRuns)) {
        $paragraphs = array();
        foreach ($longRuns[1] as $run) {
            if (!stristr($run, 'Javascript')) {
                $paragraphs[] = $run;
            }
        }

        $summary = trim(implode("\n\n", $paragraphs));
        if (strlen($summary)) {
            $governor->summary = $summary;
        }
    }

    $governor->save();
    $governor->addReference($url, null, $governor->getAllModifiedFields(), $sourceName);

    // SCHOOLS
    if (preg_match('/>School\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $schoolMatch)) {
        // Semicolon-separated list; fall back to commas when there is no semicolon.
        $schoolNames = explode(';', trim($schoolMatch[1]));
        if (count($schoolNames) == 1) {
            $schoolNames = explode(',', $schoolNames[0]);
        }

        foreach ($schoolNames as $schoolName) {
            $schoolName = trim($schoolName);

            $school = EntityTable::getByExtensionQuery('School')
                ->leftJoin('e.Alias a')
                ->addWhere('e.name = ? or a.name = ?', array($schoolName, $schoolName))
                ->fetchOne();

            if (!$school) {
                $school = new Entity();
                $school->addExtension('Org');
                $school->addExtension('School');
                $school->name = $schoolName;
                $school->save();
                $this->printDebug(' Added School: ' . $schoolName);
            }

            $existingEducation = RelationshipTable::getByCategoryQuery('Education')
                ->addWhere('entity1_id = ? and entity2_id = ?', array($governor->id, $school->id))
                ->fetchOne();

            if (!$existingEducation) {
                $education = new Relationship();
                $education->setCategory('Education');
                $education->Entity1 = $governor;
                $education->Entity2 = $school;
                $education->is_current = 0;
                $education->save();
                $education->addReference($url, null, $education->getAllModifiedFields(), $sourceName);
                $this->printDebug(' Added education: ' . $education->name);
            }
        }
    }

    // GOVERNOR OFFICE AND POSITION
    $officeName = 'Office of the Governor of ' . $row['state'];
    $office = EntityTable::getByExtensionQuery('GovernmentBody')->addWhere('name = ?', $officeName)->fetchOne();

    if (!$office) {
        $office = new Entity();
        $office->name = $officeName;
        $office->addExtension('Org');
        $office->addExtension('GovernmentBody');

        $state = Doctrine::getTable('AddressState')->findOneByName($row['state']);
        if ($state) {
            $office->state_id = $state->id;
        }

        $office->save();
        $office->addReference($url, null, $office->getAllModifiedFields(), $sourceName);
        $this->printDebug(' Added office: ' . $office->name);
    }

    $existingPosition = RelationshipTable::getByCategoryQuery('Position')
        ->addWhere('entity1_id = ? and entity2_id = ? and description1 = ?', array($governor->id, $office->id, 'Governor'))
        ->fetchOne();

    if (!$existingPosition) {
        // The sorted years are consumed pairwise: (start, end), (start, end), ...
        // A trailing unpaired year is the start of the current term.
        sort($row['years']);
        $years = $row['years'];
        $yearCount = count($years);

        for ($i = 0; $i < $yearCount; $i += 2) {
            $governorship = new Relationship();
            $governorship->setCategory('Position');
            $governorship->Entity1 = $governor;
            $governorship->Entity2 = $office;
            $governorship->description1 = 'Governor';
            $governorship->start_date = $years[$i];

            if (isset($years[$i + 1])) {
                $governorship->end_date = $years[$i + 1];
                $governorship->is_current = 0;

                if (!$governor->blurb && !isset($years[$i + 2])) {
                    $governor->blurb = 'Former Governor of ' . $row['state'];
                }
            } else {
                $governorship->is_current = 1;

                if (!$governor->blurb) {
                    $governor->blurb = 'Governor of ' . $row['state'];
                }
            }

            $governor->save();
            $governorship->save();
            $governorship->addReference($url, null, $governorship->getAllModifiedFields(), $sourceName);
            $this->printDebug(' Added governorship: ' . $governorship->name);
        }
    }

    // SPOUSE
    if (preg_match('/>Spouse\\:<\\/b>(.*?)<br/is', $text, $spouseMatch)) {
        $spouseName = trim(LsHtml::stripTags($spouseMatch[1]));

        $existingFamily = RelationshipTable::getByCategoryQuery('Family')
            ->addWhere('entity1_id = ? or entity2_id = ?', array($governor->id, $governor->id))
            ->fetchOne();

        if (!$existingFamily && strlen($spouseName)) {
            $spouse = PersonTable::parseFlatName($spouseName);
            $spouse->save();
            $this->printDebug(' Added spouse: ' . $spouse->name);

            $marriage = new Relationship();
            $marriage->setCategory('Family');
            $marriage->Entity1 = $spouse;
            $marriage->Entity2 = $governor;
            $marriage->description1 = 'Spouse';
            $marriage->description2 = 'Spouse';
            $marriage->save();
            $marriage->addReference($url, null, $marriage->getAllModifiedFields(), $sourceName);
            $this->printDebug(' Added spouse relationship: ' . $marriage->name);
        }
    }

    // ADDRESS import was disabled in the original code ("not working, malformed addresses").

    // PHONE NUMBER
    if (preg_match('/>Phone\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $phoneMatch)) {
        $phoneNumber = trim($phoneMatch[1]);

        if (!$governor->Phone->count()) {
            $phone = $governor->addPhone($phoneNumber);
            $this->printDebug(' Phone: ' . $phone);
        }
    }

    // IMAGE
    if (!$governor->Image->count() && preg_match('/<img .*?class\\="display" src\\="([^"]*)"/is', $text, $imageMatch)) {
        $imageUrl = $imageMatch[1];

        // A failed download simply means no image gets attached.
        try {
            $fileName = ImageTable::createFiles($imageUrl, $governor->name_first);
        } catch (Exception $e) {
            $fileName = null;
        }

        if ($fileName) {
            $image = new Image();
            $image->filename = $fileName;
            $image->entity_id = $governor->id;
            $image->title = $governor->name;
            $image->caption = 'From Governors Association website';
            $image->is_featured = true;
            $image->is_free = false;
            $image->url = $imageUrl;
            $image->save();
            $this->printDebug("Imported image: " . $image->filename);
        }
    }
}
/**
 * Finds text runs in an HTML string that start with the entity's name.
 *
 * A run begins at the entity's name (or nickname) and extends to the next
 * layout tag listed in LsHtml::$layoutTags.
 *
 * @param string $str HTML to search.
 * @param Entity $e   Entity whose name regex and optional nickname are used.
 *
 * @return array|null The raw matched runs (markup and whitespace untouched),
 *                    or null when nothing matches.
 */
public function getSummary($str, Entity $e)
{
    $str = LsHtml::replaceEntities($str);

    $name_patterns = array($e->getNameRegex());
    if ($e->name_nick && $e->name_nick != '') {
        $name_patterns[] = LsString::escapeStringForRegex($e->name_nick);
    }

    $name_re = implode('|', $name_patterns);
    $layout_tags = implode('|', LsHtml::$layoutTags);

    $re = '/((' . $name_re . ')(.*?))<\\/?(' . $layout_tags . ')/isu';
    $this->printDebug($re);

    if (!preg_match_all($re, $str, $matches)) {
        return null;
    }

    // The tag-stripped, whitespace-normalized form is only logged; callers have
    // always received the raw matches, so the return value is left as is.
    foreach ($matches[1] as $result) {
        $this->printDebug(LsString::spacesToSpace(LsHtml::stripTags($result)));
    }

    return $matches[1];
}
/**
 * Processes one import row: resolves the person, and when needed the
 * affiliated org and a Position relationship, inside a DB transaction.
 *
 * The person is matched against similar existing people, first by their
 * affiliations and then by the affiliation names appearing in their extended
 * bio. When nobody matches, a new person is saved. A new relationship to the
 * 'affiliation1' org is created only for bio matches and new people. A
 * summary of what happened is appended to $this->edits.
 *
 * @param array $row Row values keyed by column name; reads 'name',
 *                   'affiliation1', 'affiliation2', ..., and the optional
 *                   'bio', 'affiliation1_extensions', 'affiliation1_title',
 *                   'start_date', 'end_date', 'title', 'notes' and 'rank'.
 *
 * @return void
 *
 * @throws Exception Any exception raised while processing is rethrown after
 *                   the transaction has been rolled back.
 */
protected function processRow($row)
{
    foreach ($row as &$r) {
        $r = trim($r);
    }
    unset($r); // break the reference so a later write to $r cannot alter $row

    $edit = array(
        'Search Name' => $row['name'],
        'Affiliation Name' => $row['affiliation1'],
        'Similar Names' => array(),
        'New Person' => null,
        'Existing Person' => null,
        'New Org' => null,
        'Existing Org' => null,
        'New Relationship' => null,
    );

    try {
        $this->db->beginTransaction();

        $person = null;
        $search_person = PersonTable::parseFlatName($row['name']);
        $similar = $search_person->getSimilarEntitiesQuery(true)->execute();
        $matched_bio = false;
        $similar_ids = array();

        foreach ($similar as $s) {
            $similar_ids[] = $s->id;

            // Skip candidates whose first name is not a prefix of the searched one, nor vice versa.
            $sim_re = LsString::escapeStringForRegex($s->name_first);
            $search_re = LsString::escapeStringForRegex($search_person->name_first);
            if (preg_match('/^' . $sim_re . '/su', $search_person->name_first) == 0 && preg_match('/^' . $search_re . '/su', $s->name_first) == 0) {
                continue;
            }

            // Walk affiliation1, affiliation2, ... until one is a known affiliation of the candidate.
            $ct = 1;
            $matched_affils = array();
            $unmatched_affils = array();

            while (isset($row['affiliation' . $ct]) && trim($row['affiliation' . $ct]) != '') {
                $affil = trim($row['affiliation' . $ct]);
                $org = $s->checkAffiliations(array($affil));

                if ($org) {
                    $matched_affils[] = array($org, $affil);
                    $edit['Existing Org'] = $org->id;
                    break;
                }

                $unmatched_affils[] = $affil;
                $ct++;
            }

            if (count($matched_affils)) {
                $person = $s;
                break;
            }

            // No affiliation matched: look for the affiliation names in the candidate's bio.
            $bio = $s->getExtendedBio();

            foreach ($unmatched_affils as $affil) {
                $affil = OrgTable::removeSuffixes($affil);
                $this->printDebug($affil);
                $this->printDebug($bio);

                if (preg_match('/' . OrgTable::getNameRegex($affil) . '/su', $bio)) {
                    $matched_bio = true;
                    break;
                }
            }

            if ($matched_bio) {
                $person = $s;
                break;
            }

            $this->printDebug(' ' . $s->name . ' failed');
        }

        $edit['Similar Names'] = array_slice($similar_ids, 0, 5);
        $no_match = false;

        if (!$person) {
            if (isset($row['bio']) && trim($row['bio']) != '') {
                $search_person->summary = $row['bio'];
            }

            $search_person->save();
            $this->printDebug(' not found, new person saved: ' . $search_person->name);
            $search_person->addReference($this->source_url, null, null, $this->source_name);

            $no_match = true;
            $edit['New Person'] = $search_person->id;
            $person = $search_person;
        } else {
            if (isset($row['bio']) && trim($row['bio']) != '' && !$person->summary) {
                $person->summary = $row['bio'];
                $person->save();
            }

            $this->printDebug(' **person found: ' . $person->name);
            $edit['Existing Person'] = $person->id;
        }

        // People matched through an existing affiliation already have their link;
        // bio matches and new people still need one to the first affiliation.
        if ($matched_bio || $no_match) {
            $orgs = OrgTable::getOrgsWithSimilarNames($row['affiliation1'], true);
            $max = -1;
            $affiliated_org = null;

            // Among similarly named orgs pick the one with the most related people.
            foreach ($orgs as $org) {
                $this->printDebug(' found match: ' . $org->name);
                $ct = $org->getRelatedEntitiesQuery('Person', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 2)->count();

                if ($ct > $max) {
                    $affiliated_org = $org;
                    $edit['Existing Org'] = $affiliated_org->id;
                    $max = $ct;
                }
            }

            if (!$affiliated_org) {
                $affiliated_org = new Entity();
                $affiliated_org->addExtension('Org');

                if (isset($row['affiliation1_extensions']) && $row['affiliation1_extensions'] != '') {
                    foreach (explode(',', $row['affiliation1_extensions']) as $ext) {
                        $ext = trim($ext);
                        if (in_array($ext, ExtensionDefinitionTable::$extensionNames)) {
                            $affiliated_org->addExtension($ext);
                        }
                    }
                }

                $affiliated_org->name = $row['affiliation1'];
                $affiliated_org->save();
                $affiliated_org->addReference($this->source_url, null, null, $this->source_name);
                $edit['New Org'] = $affiliated_org->id;
            }

            $rel = new Relationship();
            $rel->Entity1 = $person;
            $rel->Entity2 = $affiliated_org;
            $rel->setCategory('Position');

            if (isset($row['affiliation1_title']) && $row['affiliation1_title'] != '') {
                $description = trim($row['affiliation1_title']);
                $rel->description1 = $description;

                // Directors, trustees and chairs sit on the board rather than on staff.
                if ($description == 'Director' || $description == 'Trustee' || preg_match('/^Chair/su', $description)) {
                    $rel->is_board = 1;
                    $rel->is_employee = 0;
                }
            }

            $rel->save();
            $rel->addReference($this->source_url, null, null, $this->source_name);
            $edit['New Relationship'] = $rel->id;
        }

        // Optional relationship details are only recorded in the edit summary.
        if (isset($row['start_date']) && trim($row['start_date']) != '') {
            $edit['Relationship']['start_date'] = trim($row['start_date']);
        }
        if (isset($row['end_date']) && trim($row['end_date']) != '') {
            $edit['Relationship']['end_date'] = trim($row['end_date']);
        }
        if (isset($row['title']) && trim($row['title']) != '') {
            $edit['Relationship']['title'] = trim($row['title']);
        }
        if (isset($row['notes']) && trim($row['notes']) != '') {
            $edit['Relationship']['notes'] = trim($row['notes']);
        }
        if (isset($row['rank']) && $row['rank'] != '') {
            $edit['rank'] = $row['rank'];
        }

        $this->db->commit();
    } catch (Exception $e) {
        $this->db->rollback();
        throw $e;
    }

    $this->edits[] = $edit;
}
/**
 * Builds a regex fragment that matches an org name loosely.
 *
 * Suffixes are removed and the name is escaped; afterwards every comma is
 * made optional, an optional comma is allowed before each "\s+" in the
 * escaped name, and "&" also matches the word "and".
 *
 * @param string $name Org name.
 *
 * @return string Regex fragment without delimiters.
 */
static function getNameRegex($name)
{
    $escaped = LsString::escapeStringForRegex(OrgTable::removeSuffixes($name));

    // str_replace() applies the pairs one after another, in this order.
    $search = array(',', '\\s+', '&');
    $replace = array(',?', ',?\\s+', '(&|\\s*and\\s*)');

    return str_replace($search, $replace, $escaped);
}
/**
 * Finds the LDA clients whose names match an org and imports their filings.
 *
 * The org name (minus suffixes) is split into terms. Clients whose name
 * contains every term are fetched; a client counts as a match when its name
 * starts with the first term (or names it after "(for" / "on behalf of") and
 * each term can be found in it as a word. Filings that have no LobbyFiling
 * yet are handed to importFiling(). Orgs already flagged 'is_complete' are
 * skipped unless _override is set.
 *
 * @param object $org Org entity; id and name are read.
 *
 * @return void
 */
private function findOrgInfo($org)
{
    $this->printDebug('***');
    $this->printDebug($org->name);

    if ($this->hasMeta($org->id, 'is_complete') && $this->getMeta($org->id, 'is_complete') && !$this->_override) {
        $this->printDebug("Already fetched lobbying data for Entity " . $org->id . "; skipping...");

        return;
    }

    $name = OrgTable::removeSuffixes($org->name, array('Bancorp'));
    $this->printDebug($name);

    $terms = preg_split('/[\\s\\.\\-]+/isu', $name, -1, PREG_SPLIT_NO_EMPTY);

    // Without terms (empty name or failed split) the query below would have no
    // conditions and every client in the table would be treated as a match.
    if (!$terms) {
        $this->printDebug('No search terms for Entity ' . $org->id . '; skipping...');

        return;
    }

    $q = LsDoctrineQuery::create()->from('LdaClient c');
    foreach ($terms as $term) {
        $q->addWhere('name like ?', '%' . $term . '%');
    }
    $clients = $q->execute();

    // Escaped forms do not depend on the client, so build them once.
    $start = LsString::escapeStringForRegex($terms[0]);
    $term_res = array();
    foreach ($terms as $term) {
        $term_res[] = LsString::escapeStringForRegex($term);
    }

    $client_names = array();
    $client_ids = array();

    foreach ($clients as $client) {
        $client_name = $client->name;

        // The first term must open the client name, directly or after "(for" / "on behalf of".
        $matched = preg_match('/^' . $start . '\\b/isu', $client_name) != 0
            || preg_match('/(\\(for\\s+|on\\s+behalf\\s+of\\s+)' . $start . '\\b/isu', $client_name) != 0;

        // Hard-coded exclusion for pilots' associations.
        if (stristr($client_name, 'pilots') && stristr($client_name, 'ass')) {
            $matched = false;
        }

        if ($matched) {
            // Every term has to be removable from the name as a separate word.
            $remaining = $client_name;
            foreach ($term_res as $term_re) {
                $new = preg_replace('/((^|\\s)|\\b)' . $term_re . '(\\b|(\\s|$))/isu', ' ', $remaining, 1);
                if ($new == $remaining) {
                    $matched = false;
                    break;
                }
                $remaining = $new;
            }
        }

        if ($matched) {
            $client_ids[] = $client->id;
            $client_names[] = $client->name;
        }
    }

    $client_names = array_unique($client_names);

    // A one-word org name that matches 30 or more distinct clients is too ambiguous to import.
    if (count($terms) > 1 || count($client_names) < 30) {
        foreach ($client_ids as $client_id) {
            $lda_filings = Doctrine::getTable('LdaFiling')->findByClientId($client_id);

            foreach ($lda_filings as $lda_filing) {
                $lf = Doctrine::getTable('LobbyFiling')->findOneByFederalFilingId($lda_filing->federal_filing_id);

                if (!$lf) {
                    $this->printDebug($lda_filing->id);
                    $this->printDebug(number_format(memory_get_usage()));
                    $this->importFiling($org, $lda_filing);
                } else {
                    $this->printDebug('Previously imported: ' . $lda_filing->federal_filing_id . "\n");
                }
            }
        }
    }

    $this->printDebug(count($client_names));
    $this->printDebug(implode(', ', $client_names));
}
/**
 * Decides whether the blurb fragments in $info carry enough real content
 * and, if so, stores the joined text under $info['blurb'].
 *
 * Boilerplate words (board/director wording, the "since" and age values,
 * the person's own name words), four-digit numbers and non-alphabetic
 * characters are removed before the remaining pieces are counted.
 *
 * @param array $info      Reads 'blurb_arr' and 'since'; may gain 'blurb'.
 * @param array $age_match Reads 'name_match'['id'] and 'age'.
 *
 * @return array $info, with 'blurb' set when the text passes both checks.
 */
private function parseBlurb($info, $age_match)
{
    if (count($info['blurb_arr']) == 0) {
        return $info;
    }

    $person = Doctrine::getTable('Entity')->find($age_match['name_match']['id']);

    $boilerplate = array_merge(
        array('director', 'directors', 'since', 'board', $info['since'], $age_match['age'], 'age'),
        explode(' ', $person->name)
    );

    // One whole-word pattern per boilerplate word, applied one at a time below.
    $patterns = array();
    foreach ($boilerplate as $word) {
        $patterns[] = '/\\b' . LsString::escapeStringForRegex($word) . '\\b/isu';
    }

    // First pass: keep the fragments with more than three pieces left.
    $kept = array();
    foreach ($info['blurb_arr'] as $fragment) {
        $stripped = $fragment;
        foreach ($patterns as $pattern) {
            $stripped = preg_replace($pattern, '', $stripped);
        }
        $stripped = preg_replace('/\\b\\d\\d\\d\\d\\b/', '', $stripped);
        $stripped = LsString::stripNonAlpha($stripped, ' ');

        if (count(preg_split('/\\s+/s', $stripped)) > 3) {
            $kept[] = $fragment;
        }
    }

    if (count($kept) == 0) {
        return $info;
    }

    // Second pass: the joined blurb must keep more than four pieces once
    // common title words are ignored as well.
    $blurb = implode(' ', $kept);

    foreach (array('executive', 'vice', 'president', 'chief', 'chairman', 'of', 'the') as $word) {
        $patterns[] = '/\\b' . LsString::escapeStringForRegex($word) . '\\b/isu';
    }

    $stripped = $blurb;
    foreach ($patterns as $pattern) {
        $stripped = preg_replace($pattern, '', $stripped);
    }
    $stripped = preg_replace('/\\b\\d\\d\\d\\d\\b/', '', $stripped);
    $stripped = LsString::stripNonAlpha($stripped, ' ');

    if (count(preg_split('/\\s+/s', $stripped)) > 4) {
        $info['blurb'] = $blurb;
    }

    return $info;
}
/**
 * Extracts a full personal name from text, given the person's last name.
 *
 * For each occurrence of the last name, the words before it are read
 * backwards (initials and non-lowercase words are kept, a lowercase word
 * stops the scan, a word ending in a period is kept only when it is a known
 * prefix) and the words after it are kept while they are known suffixes.
 *
 * @param string $str  Text to search.
 * @param string $last Last name; spaces and hyphens in it are interchangeable.
 *
 * @return array|null Array with the keys 'nameFull', 'nameStart', 'nameLast'
 *                    and 'namePost' for the last occurrence that yielded a
 *                    name; null when none did or when an occurrence sits at
 *                    the very start of the text.
 */
static function getNameWithLast($str, $last)
{
    //hyphens and spaces interchangeable in last names
    $re_last = preg_replace('/\\\\s+|\\\\\\-/is', '(\\s+|\\-)', LsString::escapeStringForRegex($last));

    $hits = array();
    preg_match_all('/\\b' . $re_last . '\\b/isu', $str, $hits, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $name = null;

    foreach ($hits as $hit) {
        list($found_last, $offset) = $hit[0];

        // A last name at the very beginning leaves no room for a first name.
        if ($offset == 0) {
            return null;
        }

        //work backwards from last name to find comma
        // NOTE: $str is cut down here and stays cut for later occurrences, whose
        // offsets still refer to the full text (kept exactly as the original does).
        $comma = strripos($str, ',', $offset - strlen($str));
        $str = substr($str, $comma);

        $pieces = preg_split('/\\b' . $re_last . '\\b/is', $str);
        $before = $pieces[0];
        $after = $pieces[1];

        // Words in front of the last name, nearest first.
        $leading = array();
        foreach (array_reverse(preg_split('/[\\s]+/', $before, -1, PREG_SPLIT_NO_EMPTY)) as $word) {
            $case = LsString::checkCase($word);
            if (!$case) {
                continue;
            }

            if ($case == 'initial') {
                $leading[] = $word;
                continue;
            }

            if ($case == 'lower') {
                break;
            }

            // A word ending in a period closes the name; keep it only if it is a known prefix.
            if (preg_match('/\\.(\\P{L})*$/u', $word) == 1) {
                $word = LsString::stripNonAlpha($word);
                if ($prefix = LsArray::inArrayNoCase($word, PersonTable::$nameParsePrefixes)) {
                    $leading[] = $prefix;
                }
                break;
            }

            $leading[] = $word;
        }

        $pre = implode(' ', array_reverse($leading));
        if (strlen(trim($pre)) == 0) {
            continue;
        }

        // Words after the last name are kept while they are known suffixes.
        $trailing = array();
        foreach (preg_split('/[\\s]+/', $after, -1, PREG_SPLIT_NO_EMPTY) as $word) {
            $case = LsString::checkCase($word);
            if (!$case) {
                continue;
            }

            if ($case == 'lower') {
                break;
            }

            $word = LsString::stripNonAlpha($word);
            $suffix = LsArray::inArrayNoCase($word, PersonTable::$nameParseSuffixes);
            if (!$suffix) {
                break;
            }

            $trailing[] = $suffix;
        }

        $post = trim(implode(' ', $trailing));

        $full = $pre . ' ' . $found_last;
        if (strlen($post) > 0) {
            $full .= ', ' . $post;
        }

        $name = array('nameFull' => $full, 'nameStart' => $pre, 'nameLast' => $found_last, 'namePost' => $post);
    }

    return $name;
}
/**
 * Splits a free-form job description into normalized position titles.
 *
 * The corporation's name, nickname and ticker are removed, the text is split
 * on separators (comma, semicolon, "and", "&", "/", most hyphens) and each
 * part has its abbreviations expanded. A part becomes a description when it
 * is a known business position or an existing Relationship description1.
 * The first part that cannot be resolved is shortened word by word until it
 * can, falling back to the part itself; later unresolved parts are appended
 * as notes to the previous description.
 *
 * @param string $str  Raw description text.
 * @param object $corp Corporation record; name, name_nick and ticker are read.
 *
 * @return array List of arrays with the keys 'description' and 'note'.
 */
public function parseDescriptionStr($str, $corp)
{
    $descriptions = array();

    //cleanup text to be parsed
    $str = trim($str);
    $str = str_replace('.', ' ', $str);
    $str = preg_replace('/\\s{2,}/', ' ', $str);

    $name_re = LsString::escapeStringForRegex($corp->name);
    $str = preg_replace('/\\b' . $name_re . '\\b/isu', '', $str);

    if ($corp->name_nick) {
        $nick_re = LsString::escapeStringForRegex($corp->name_nick);
        $str = preg_replace('/\\b' . $nick_re . '\\b/isu', '', $str);
    }

    if ($corp->ticker) {
        $tick_re = LsString::escapeStringForRegex($corp->ticker);
        $str = preg_replace('/\\b' . $tick_re . '\\b/isu', '', $str);
    }

    //split by commas (and the other separators)
    $parts = preg_split('/,|;|\\band\\b|(?<!C[Oo])\\-|\\bAND\\b|\\s&\\s|\\//', $str, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($parts as $part) {
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        //abbreviation replacements (order matters: later rules see earlier results)
        $part = preg_replace('/( |^)(\\w) (\\w) (\\w)( |$)/', '\\2\\3\\4', $part);
        $part = preg_replace('/(Interim|Acting|Incoming) /i', '', $part);
        $part = preg_replace('/Sr /i', 'Senior ', $part);
        $part = preg_replace('/Chf /i', 'Chief ', $part);
        $part = preg_replace('/( |^)V( |$)/i', ' Vice ', $part);
        $part = preg_replace('/( |^)VP( |$)/i', ' Vice President ', $part);
        $part = preg_replace('/( |^)VC( |$)/i', ' Vice Chairman ', $part);
        $part = preg_replace('/( |^)Chr( |$)/i', ' Chairman ', $part);
        $part = preg_replace('/( |^)Ofcr( |$)/i', ' Officer ', $part);
        $part = preg_replace('/( |^)Vice P( |$)/i', ' Vice President ', $part);
        $part = preg_replace('/( |^)(Ex|Exec)( |$)/i', ' Executive ', $part);
        $part = preg_replace('/( |^)EVP( |$)/i', ' Executive Vice President ', $part);
        $part = preg_replace('/( |^)(Off|Offic|Offcr)( |$)/i', ' Officer ', $part);
        $part = str_replace('Gen ', 'General ', $part);
        $part = preg_replace('/( |^)(Op|Oper) /', ' Operating ', $part);
        $part = preg_replace('/( |^)(Bd|Brd)( |$)/i', ' Board ', $part);
        $part = preg_replace('/of Board/i', ' of the Board', $part);
        $part = preg_replace('/( |^)COB( |$)/i', ' Chairman of the Board ', $part);
        $part = preg_replace('/( |^)(Pres|Prs|Presid|Prsdt|Prsdnt)( |$)/i', ' President ', $part);
        $part = preg_replace('/( |^)Admin( |$)/i', ' Administrative ', $part);
        $part = preg_replace('/( |^)Info( |$)/i', ' Information ', $part);
        $part = preg_replace('/\\bComm\\b/i', 'Committee', $part);
        $part = preg_replace('/\\bInc\\b/i', '', $part);
        $part = preg_replace('/( |-|^)(Ch|Chm|Chmn|Chrm|Chrmn|Chair|Chairmain|Chariman)( |$)/i', '\\1Chairman ', $part);
        $part = preg_replace('/(Sec|Secr|Secy|Secretar|Secreta)( |$)/i', 'Secretary ', $part);
        $part = str_replace('Vice-', 'Vice ', $part);
        $part = preg_replace('/( |^)Non /i', ' Non-', $part);
        $part = preg_replace('/\\bCompl\\b/i', 'Compliance', $part);
        $part = str_ireplace('of Advisory', 'of the Advisory', $part);
        $part = preg_replace('/Advisory (Panel|Council)/i', 'Advisory Board', $part);
        $part = str_ireplace('Independent ', '', $part);
        $part = str_ireplace('Lead ', '', $part);
        $part = str_ireplace('Corporate ', '', $part);
        $part = str_ireplace('Outside ', '', $part);
        $part = str_ireplace('Non-interested', '', $part);
        $part = str_ireplace('Interested', '', $part);
        $part = str_replace('Main ', '', $part);
        $part = str_ireplace('Presiding ', '', $part);
        $part = str_ireplace('Founding ', '', $part);
        $part = str_ireplace('Acctg', 'Accounting', $part);
        $part = str_ireplace('Chairperson', 'Chairman', $part);
        $part = str_ireplace('Chairwoman', 'Chairman', $part);
        $part = str_ireplace("Gen'l", 'General', $part);
        $part = trim($part);
        $part = preg_replace('/\\s{2,}/', ' ', $part);

        if ($part == '') {
            continue;
        }

        $position = array('description' => null, 'note' => array());

        //look for matching title
        $p = LsArray::inArrayNoCase($part, PositionTable::$businessPositions);

        if ($p) {
            $position['description'] = $p;
        } elseif ($q = Doctrine::getTable('Relationship')->findOneByDescription1($part)) {
            // The lookup takes the title text; the previous code passed the $position array.
            $position['description'] = $q->description1;
        } elseif (count($descriptions) == 0) {
            // First part and still unknown: drop trailing words one at a time and retry.
            // The dropped words are discarded (the original collected them into an unused local).
            $part_splat = LsString::split($part);
            $lim = count($part_splat) - 1;

            for ($i = 0; $i < $lim; $i++) {
                array_pop($part_splat);
                $part_new = implode(' ', $part_splat);

                // Never shorten a longer title down to a bare "Director".
                if (strtoupper($part_new) == 'DIRECTOR') {
                    break;
                }

                $p = LsArray::inArrayNoCase($part_new, PositionTable::$businessPositions);

                if ($p) {
                    $position['description'] = $p;
                } elseif ($q = Doctrine::getTable('Relationship')->findOneByDescription1($part_new)) {
                    $position['description'] = $q->description1;
                }
            }

            if (!$position['description']) {
                $position['description'] = $part;
            }
        } else {
            // Unknown text after a recognized title is kept as a note on that title.
            $descriptions[count($descriptions) - 1]['note'][] = $part;
        }

        if (isset($position['description'])) {
            $descriptions[] = $position;
        }
    }

    return $descriptions;
}
/**
 * Multi-step "add bulk" action: collects a list of names related to the
 * current entity and turns them into relationships.
 *
 * The posted 'commit' value selects the phase:
 *  - 'Begin' / 'Continue': resolve the reference, gather candidate names
 *    according to 'add_method' (scrape, upload, summary, text, db_search),
 *    load the relationship-form defaults, then either redirect to the
 *    one-by-one toolbar or populate $this->matches for en-masse review.
 *  - 'Submit': create the selected/new entities and one relationship per row,
 *    then redirect to the entity page.
 *  - 'Cancel': redirect to the entity page.
 * When none of those apply and a 'page' parameter is present, only the
 * paging values are stored on the action.
 *
 * Several branches return early without a value after setting a request
 * error or the confirm_names flag.
 *
 * @param mixed $request the current request object (must expose
 *                       getParameter(), hasParameter(), isMethod(),
 *                       getFiles(), getParameterHolder() and setError())
 *
 * @return void
 */
public function executeAddBulk($request)
{
    $this->checkEntity($request, false, false);

    $this->reference_form = new ReferenceForm();
    $this->reference_form->setSelectObject($this->entity);
    $this->add_bulk_form = new AddBulkForm();

    //get possible default categories (name => name), with an empty first option
    $this->categories = LsDoctrineQuery::create()
        ->select('c.name, c.name')
        ->from('RelationshipCategory c')
        ->orderBy('c.id')
        ->fetchAll(PDO::FETCH_KEY_PAIR);
    array_unshift($this->categories, '');

    if ($request->isMethod('post') && in_array($request->getParameter('commit'), array('Begin', 'Continue'))) {
        if ($request->hasParameter('ref_id')) {
            $this->ref_id = $request->getParameter('ref_id');
        } else {
            $refParams = $request->getParameter('reference');
            $this->reference_form->bind($refParams);

            // presumably the first element of the cast holder is its raw
            // parameters array -- verify against the parameter holder class
            $restOfParams = (array) $request->getParameterHolder();
            $restOfParams = array_shift($restOfParams);
            $this->add_bulk_form->bind($restOfParams, $request->getFiles());

            if (!$this->reference_form->isValid() || !$this->add_bulk_form->isValid()) {
                return;
            }

            if ($this->ref_id = $refParams['existing_source']) {
                $ref = Doctrine::getTable('Reference')->find($this->ref_id);
            } else {
                $ref = new Reference();
                $ref->object_model = 'Entity';
                $ref->object_id = $this->entity->id;
                $ref->source = $refParams['source'];
                $ref->name = $refParams['name'];
                $ref->source_detail = $refParams['source_detail'];
                $ref->publication_date = $refParams['publication_date'];
                $ref->save();
            }

            $this->ref_id = $ref->id;
            $this->reference = $ref;
        }

        $verify_method = $request->getParameter('verify_method');

        if ($this->add_method = $request->getParameter('add_method')) {
            if ($this->add_method == 'scrape') {
                //scrape ref url
                //set names to confirm
                $browser = new sfWebBrowser();
                $entity_types = $request->getParameter('entity_types');

                // NOTE(review): $ref is only assigned above when no 'ref_id'
                // parameter was posted; confirm this branch is never reached
                // together with a posted ref_id.
                //FIND NAMES AT URL USING COMBO OF OPENCALAIS & LS CUSTOM HTML PARSING
                if (!$browser->get($ref->source)->responseIsError()) {
                    $text = $browser->getResponseText();
                    $this->names = LsTextAnalysis::getHtmlEntityNames($text, $entity_types);
                    $text = LsHtml::findParagraphs($text);
                    $this->text = preg_replace('/<[^b][^>]*>/is', " ", $text);
                    $this->confirm_names = true;

                    return;
                } else {
                    $request->setError('csv', 'problems finding names at that url');
                }
            } elseif ($this->add_method == 'upload') {
                $file = $this->add_bulk_form->getValue('file');

                // Guard on the uploaded file itself: the previously tested
                // $filePath is a built string and can never be empty.
                if (!$file) {
                    $request->setError('file', 'You need to upload a file.');

                    return;
                }

                $filename = 'uploaded_' . sha1($file->getOriginalName());
                $extension = $file->getExtension($file->getOriginalExtension());
                $filePath = sfConfig::get('sf_temp_dir') . '/' . $filename . $extension;
                $file->save($filePath);

                if ($spreadsheetArr = LsSpreadsheet::parse($filePath)) {
                    $names = $spreadsheetArr['rows'];

                    if (!in_array('name', $spreadsheetArr['headers'])) {
                        $request->setError('file', 'The file you uploaded could not be parsed properly because there is no "name" column.');

                        return;
                    }

                    if (in_array('summary', $spreadsheetArr['headers'])) {
                        foreach ($names as &$name) {
                            // Normalise typographic quotes to ASCII. The search
                            // strings previously read as a literal '?', which
                            // rewrote every question mark; they appear to have
                            // been curly quotes lost to an encoding conversion
                            // (TODO confirm uploads are UTF-8).
                            $name['summary'] = str_replace(array("\xE2\x80\x98", "\xE2\x80\x99"), "'", $name['summary']);
                            $name['summary'] = str_replace(array("\xE2\x80\x9C", "\xE2\x80\x9D"), '"', $name['summary']);

                            if (isset($name['title'])) {
                                $name['description1'] = $name['title'];
                            }
                        }
                        unset($name);
                    }
                } else {
                    $request->setError('file', 'The file you uploaded could not be parsed properly.');

                    return;
                }
            } elseif ($this->add_method == 'summary') {
                //parse summary for names
                $this->text = $this->entity->summary;
                $entity_types = $request->getParameter('entity_types');
                $this->names = LsTextAnalysis::getTextEntityNames($this->text, $entity_types);
                $this->confirm_names = true;

                return;
            } elseif ($this->add_method == 'text') {
                $manual_names = $request->getParameter('manual_names');

                if ($manual_names && $manual_names != "") {
                    // one name per line
                    $manual_names = preg_split('#[\\r\\n]+#', $manual_names);
                    $manual_names = array_map('trim', $manual_names);
                    $names = array();

                    foreach ($manual_names as $name) {
                        $names[] = array('name' => $name);
                    }
                } else {
                    $request->setError('csv', 'You did not add names properly.');

                    return;
                }
            } elseif ($this->add_method == 'db_search') {
                $this->db_search = true;
            }
        }

        //intermediate scrape page -- takes confirmed names, builds names arr
        if ($confirmed_names = $request->getParameter('confirmed_names')) {
            $restOfParams = (array) $request->getParameterHolder();
            $restOfParams = array_shift($restOfParams);
            $this->add_bulk_form->bind($restOfParams, $request->getFiles());

            if (!$this->add_bulk_form->isValid()) {
                $this->reference = Doctrine::getTable('Reference')->find($this->ref_id);
                // SECURITY NOTE(review): unserialize() on a request parameter
                // can instantiate arbitrary classes; consider carrying this
                // list as JSON or in the session instead.
                $this->names = unserialize(stripslashes($request->getParameter('names')));
                $this->confirm_names = true;

                return;
            }

            $names = array();

            foreach ($confirmed_names as $cn) {
                $names[] = array('name' => $cn);
            }

            $manual_names = $request->getParameter('manual_names');

            if ($manual_names && $manual_names != "") {
                $manual_names = preg_split('#[\\r\\n]+#', $manual_names);
                $manual_names = array_map('trim', $manual_names);

                foreach ($manual_names as $name) {
                    $names[] = array('name' => $name);
                }
            }
        }

        // LOAD IN RELATIONSHIP DEFAULTS
        if (isset($verify_method)) {
            $defaults = $request->getParameter('relationship');

            if ($verify_method == 'enmasse') {
                $this->default_type = $request->getParameter('default_type');
                $this->order = $request->getParameter('order');
                $category_name = $request->getParameter('relationship_category_all');
                $this->extensions = ExtensionDefinitionTable::getByTier(2, $this->default_type);
                $extensions_arr = array();

                foreach ($this->extensions as $ext) {
                    $extensions_arr[] = $ext->name;
                }
            } else {
                $category_name = $request->getParameter('relationship_category_one');
            }

            if ($category_name) {
                $this->category_name = $category_name;

                // the category must exist before its name is used to build a form class name
                if (!($category = Doctrine::getTable('RelationshipCategory')->findOneByName($category_name))) {
                    $request->setError('csv', 'You did not select a relationship category.');

                    return;
                }

                $formClass = $category_name . 'Form';
                $categoryForm = new $formClass(new Relationship());
                $categoryForm->setDefaults($defaults);
                $this->form_schema = $categoryForm->getFormFieldSchema();

                if (in_array($category_name, array('Position', 'Education', 'Membership', 'Donation', 'Lobbying', 'Ownership'))) {
                    $this->field_names = array('description1', 'start_date', 'end_date', 'is_current');
                } else {
                    $this->field_names = array('description1', 'description2', 'start_date', 'end_date', 'is_current');
                }

                $extraFields = array(
                    'Position' => array('is_board', 'is_executive'),
                    'Education' => array('degree_id'),
                    'Donation' => array('amount'),
                    'Transaction' => array('amount'),
                    'Lobbying' => array('amount'),
                    'Ownership' => array('percent_stake', 'shares'),
                );

                if (isset($extraFields[$category_name])) {
                    $this->field_names = array_merge($this->field_names, $extraFields[$category_name]);
                }
            }

            $this->matches = array();

            // BOOT TO TOOLBAR OR LOOK FOR MATCHES FOR ENMASSE ADD
            if (isset($names) && count($names) > 0 || isset($this->db_search)) {
                if ($verify_method == 'onebyone') {
                    if (isset($category_name)) {
                        $defaults['category'] = $category_name;
                    }

                    $toolbar_names = array();

                    // $names is not set when only db_search triggered this branch
                    if (isset($names)) {
                        foreach ($names as $name) {
                            $toolbar_names[] = $name['name'];
                        }
                    }

                    $this->getUser()->setAttribute('toolbar_names', $toolbar_names);
                    $this->getUser()->setAttribute('toolbar_entity', $this->entity->id);
                    $this->getUser()->setAttribute('toolbar_defaults', $defaults);
                    $this->getUser()->setAttribute('toolbar_ref', $this->ref_id);
                    $this->redirect('relationship/toolbar');
                } else {
                    $this->category_name = $category_name;

                    if (isset($this->db_search)) {
                        // NOTE(review): $num and $page are read but not used below
                        $num = $request->getParameter('num', 10);
                        $page = $request->getParameter('page', 1);

                        // entities whose summary or blurb mentions this entity's
                        // name or one of its aliases as a whole word
                        $q = LsDoctrineQuery::create()
                            ->from('Entity e')
                            ->where('(e.summary rlike ? or e.blurb rlike ?)', array('[[:<:]]' . $this->entity->name . '[[:>:]]', '[[:<:]]' . $this->entity->name . '[[:>:]]'));

                        foreach ($this->entity->Alias as $alias) {
                            $q->orWhere('(e.summary rlike ? or e.blurb rlike ?)', array('[[:<:]]' . $alias->name . '[[:>:]]', '[[:<:]]' . $alias->name . '[[:>:]]'));
                        }

                        $q->setHydrationMode(Doctrine::HYDRATE_ARRAY);
                        $cat_id = constant('RelationshipTable::' . strtoupper($category_name) . '_CATEGORY');
                        $q->whereParenWrap();
                        // exclude entities already linked to this one in the chosen category
                        $q->andWhere('NOT EXISTS (SELECT DISTINCT l.relationship_id FROM Link l ' . 'WHERE l.entity1_id = e.id AND l.entity2_id = ? AND l.category_id = ?)', array($this->entity['id'], $cat_id));
                        $summary_matches = $q->execute();

                        // The highlight pattern does not depend on the result
                        // row, so it is built once.
                        $escaped_aliases = array();

                        foreach ($this->entity->Alias as $alias) {
                            $escaped_aliases[] = LsString::escapeStringForRegex($alias->name);
                        }

                        $alias_pattern = implode("|", $escaped_aliases);

                        foreach ($summary_matches as $summary_match) {
                            // An empty alternation would match at every
                            // position, so highlighting is skipped then.
                            if ($alias_pattern !== '') {
                                $summary_match['summary'] = preg_replace('/(' . $alias_pattern . ')/is', '<strong>$1</strong>', $summary_match['summary']);
                            }

                            $this->matches[] = array('search_results' => array($summary_match));
                        }
                    } else {
                        $name_count = count($names);

                        for ($i = 0; $i < $name_count; $i++) {
                            if (isset($names[$i]['name']) && trim($names[$i]['name']) != '') {
                                $name = $names[$i]['name'];
                                $name_terms = $name;

                                if ($this->default_type == 'Person') {
                                    $name_parts = preg_split('/\\s+/', $name);

                                    // single-word person names are searched as-is
                                    if (count($name_parts) > 1) {
                                        $name_terms = PersonTable::nameSearch($name);
                                    }

                                    $terms = $name_terms;
                                    $primary_ext = "Person";
                                } elseif ($this->default_type == 'Org') {
                                    $name_terms = OrgTable::nameSearch($name);
                                    $terms = $name_terms;
                                    $primary_ext = "Org";
                                } else {
                                    $terms = $name_terms;
                                    $primary_ext = null;
                                }

                                $pager = EntityTable::getSphinxPager($terms, $page = 1, $num = 20, $listIds = null, $aliases = true, $primary_ext);
                                $match = $names[$i];
                                $match['search_results'] = $pager->execute();

                                if (isset($names[$i]['types'])) {
                                    $types = explode(',', $names[$i]['types']);
                                    $types = array_map('trim', $types);
                                    $match['types'] = array();

                                    // keep only types found among the extensions loaded above
                                    foreach ($types as $type) {
                                        if (in_array($type, $extensions_arr)) {
                                            $match['types'][] = $type;
                                        }
                                    }
                                }

                                $this->matches[] = $match;
                            }
                        }
                    }
                }
            }
        }
    } elseif ($page = $this->getRequestParameter('page')) {
        $this->page = $page;
        $this->num = $this->getRequestParameter('num', 50);
    } elseif ($request->isMethod('post') && $request->getParameter('commit') == 'Submit') {
        $this->ref_id = $this->getRequestParameter('ref_id');
        $relationship_category = $this->getRequestParameter('category_name');
        $order = $this->getRequestParameter('order');
        $default_type = $request->getParameter('default_type');
        $default_ref = Doctrine::getTable('Reference')->find($request->getParameter('ref_id'));
        $row_count = $this->getRequestParameter('count');

        for ($i = 0; $i < $row_count; $i++) {
            if ($entity_id = $request->getParameter('entity_' . $i)) {
                $selected_entity_id = null;
                // Start every row without a reference so a value from an
                // earlier row can never be reused here.
                $ref = null;
                $relParams = $request->getParameter("relationship_" . $i);

                if ($relParams['ref_name']) {
                    $ref = array(
                        'source' => $relParams['ref_source'],
                        'name' => $relParams['ref_name'],
                    );
                }

                if ($entity_id == 'new') {
                    $name = $request->getParameter('new_name_' . $i);

                    if ($default_type == 'Person') {
                        $new_entity = PersonTable::parseFlatName($name);
                    } else {
                        $new_entity = new Entity();
                        $new_entity->addExtension('Org');
                        $new_entity->name = trim($name);
                    }

                    // saved once before the reference is attached and again afterwards
                    $new_entity->save();
                    $new_entity->blurb = $request->getParameter('new_blurb_' . $i);
                    $new_entity->summary = $request->getParameter('new_summary_' . $i);

                    if (!$ref) {
                        $ref = $default_ref;
                    }

                    $new_entity->addReference($ref['source'], null, null, $ref['name']);

                    if ($types = $request->getParameter('new_extensions_' . $i)) {
                        foreach ($types as $type) {
                            $new_entity->addExtension($type);
                        }
                    }

                    $new_entity->save();
                    $selected_entity_id = $new_entity->id;
                } elseif ($entity_id > 0) {
                    $selected_entity_id = $entity_id;
                    LsCache::clearEntityCacheById($selected_entity_id);
                }

                if ($selected_entity_id) {
                    $startDate = $relParams['start_date'];
                    $endDate = $relParams['end_date'];
                    // Drop everything that is not relationship data. The row
                    // posts 'ref_source' (read above), so it is removed along
                    // with the previously listed 'ref_url'.
                    unset($relParams['start_date'], $relParams['end_date'], $relParams['ref_name'], $relParams['ref_url'], $relParams['ref_source']);

                    $rel = new Relationship();
                    $rel->setCategory($relationship_category);

                    // 'order' decides which side of the relationship the current entity is on
                    if ($order == '1') {
                        $rel->entity1_id = $this->entity['id'];
                        $rel->entity2_id = $selected_entity_id;
                    } else {
                        $rel->entity2_id = $this->entity['id'];
                        $rel->entity1_id = $selected_entity_id;
                    }

                    //only set dates if valid
                    if ($startDate && preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', Dateable::convertForDb($startDate))) {
                        $rel->start_date = Dateable::convertForDb($startDate);
                    }

                    if ($endDate && preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', Dateable::convertForDb($endDate))) {
                        $rel->end_date = Dateable::convertForDb($endDate);
                    }

                    $rel->fromArray($relParams, null, $hydrateCategory = true);

                    // for db_search rows, prefer a reference already attached to the matched entity
                    if ($request->hasParameter('add_method') && $request->getParameter('add_method') == 'db_search') {
                        $refs = EntityTable::getSummaryReferences($selected_entity_id);

                        if (count($refs)) {
                            $ref = $refs[0];
                        } else {
                            $refs = EntityTable::getAllReferencesById($selected_entity_id);

                            if (count($refs)) {
                                $ref = $refs[0];
                            }
                        }
                    }

                    if (!$ref) {
                        $ref = $default_ref;
                    }

                    $rel->saveWithRequiredReference(array('source' => $ref['source'], 'name' => $ref['name']));
                }
            }
        }

        $this->clearCache($this->entity);
        $this->redirect($this->entity->getInternalUrl());
    } elseif ($request->isMethod('post') && $request->getParameter('commit') == 'Cancel') {
        $this->redirect($this->entity->getInternalUrl());
    }
}