Esempio n. 1
0
 /**
  * Collects the passages of $str that begin with the entity's name (or
  * nickname) and run up to the next layout tag.
  *
  * Each passage is passed through LsHtml::stripTags() and
  * LsString::spacesToSpace() before being returned. Previously the cleaned
  * text was assigned to a by-value loop variable and thrown away, so the raw
  * matches were returned instead.
  *
  * @param string $str HTML to search; entities are replaced before matching
  * @param Entity $e   entity supplying getNameRegex() and an optional name_nick
  *
  * @return array|null cleaned passages, or null when nothing matched
  */
 public function getSummary($str, Entity $e)
 {
     $str = LsHtml::replaceEntities($str);
     $name_re = array($e->getNameRegex());
     if ($e->name_nick) {
         $name_re[] = LsString::escapeStringForRegex($e->name_nick);
     }
     $name_re = implode('|', $name_re);
     $layout_tags = implode('|', LsHtml::$layoutTags);
     // name, then the shortest run of text up to an opening or closing layout tag
     $re = '/((' . $name_re . ')(.*?))<\\/?(' . $layout_tags . ')/isu';
     $this->printDebug($re);
     if (!preg_match_all($re, $str, $matches)) {
         return null;
     }
     $results = array();
     foreach ($matches[1] as $result) {
         $result = LsString::spacesToSpace(LsHtml::stripTags($result));
         $this->printDebug($result);
         $results[] = $result;
     }
     return $results;
 }
 /**
  * Heuristically extracts a biography for $person from an HTML page.
  *
  * Text runs that contain the person's last name (and first name, when the
  * page title mentions both) are filtered through a cascade of checks:
  * script-like content, length, name presence, word count, news markers,
  * quotations, lowercase-word count, trailing period and common bio words.
  * Surviving runs are joined with blank lines and validated once more before
  * being returned.
  *
  * @param string $page   raw HTML of the page
  * @param object $person reads name_first, name_last and getExtensions()
  * @param object $org    reads name; when it contains the person's last name an
  *                       extra first-name / pronoun check is applied
  *
  * @return string|false the assembled bio, or false when nothing qualifies
  */
 private function findPersonBio($page, $person, $org)
 {
     $name_re = LsString::escapeStringForRegex($person->name_last);
     if (preg_match('/<title>([^<]*)<\\/title>/is', $page, $title)) {
         // also accept the first name when the title names the person in full
         if (stristr($title[1], $person->name_last) && stristr($title[1], $person->name_first) && strlen($person->name_first) > 2) {
             $name_re .= '|' . LsString::escapeStringForRegex($person->name_first);
         }
     }

     $layout_tags = implode('|', LsHtml::$layoutTags);
     $plain_re = '/>([^<]*?(' . $name_re . ')(\\s|,|<)(.*?))<\\/?(' . $layout_tags . ')/is';
     $utf8_re = $plain_re . 'u';

     // try the unicode-aware pattern first, then the byte-wise one
     if (!preg_match_all($utf8_re, $page, $found) && !preg_match_all($plain_re, $page, $found)) {
         $this->printDebug('no matches found');
         return false;
     }

     $candidates = array();
     $qualified = false;
     // NOTE: this flag is deliberately not reset per candidate, matching the
     // long-standing behaviour; it is only cleared by a qualifying candidate.
     $looks_like_news = false;

     foreach ($found[1] as $chunk) {
         // skip anything that looks like script or css
         if (stristr($chunk, '}') || stristr($chunk, '{') || preg_match('/\\svar\\s/is', $chunk)) {
             continue;
         }

         $text = LsHtml::replaceEntities($chunk);
         $text = LsHtml::stripTags($text, '');
         $text = trim(LsString::spacesToSpace($text));
         $length = strlen($text);
         $this->printDebug($length);

         if ($length > 3000) {
             $this->printDebug('FAILED - str too long');
             continue;
         }
         if (preg_match('/(^|\\b)(' . $name_re . ')\\b/is', $text) == 0) {
             $this->printDebug($chunk . 'FAILED - no name match');
             continue;
         }
         if (count(explode(' ', $text)) < 12) {
             $this->printDebug('FAILED - str not long enough');
             continue;
         }
         if (stristr($text, 'announce') || stristr($text, 'today') || stristr($text, '—') || stristr($text, '–') || preg_match('/^[^\\-]{0,100}\\-(\\-|\\s)/is', $text)) {
             $looks_like_news = true;
             $this->printDebug('FAILED: dash / announced / today');
             continue;
         }
         // NOTE(review): \1 refers to the (^|\s) group, not the quote character
         // in group 2 -- possibly unintended; left untouched to keep results stable.
         if (preg_match('/(^|\\s)([\'"”])([^\\1]+)\\1/is', $text, $quoted) && count(explode(' ', $quoted[0])) > 6) {
             $looks_like_news = true;
             $this->printDebug('FAILED: quote');
             continue;
         }
         if (preg_match_all('/\\s(\\p{Ll})+\\b/su', $text, $lowercase) < 5) {
             $this->printDebug('FAILED: not enough lowercase');
             continue;
         }

         $vocabulary = PersonTable::$commonBioWords;
         if (in_array('Lobbyist', $person->getExtensions())) {
             $vocabulary = array_merge($vocabulary, LobbyistTable::$commonBioWords);
         }
         $bio_word_ct = preg_match_all('/\\s(' . implode('|', $vocabulary) . ')\\s/is', $text, $bio_hits);

         $text = trim($text);
         if (preg_match('/\\.$/is', $text) == 0) {
             $this->printDebug('no period at end of string');
             continue;
         }
         if ($bio_word_ct > 1) {
             $looks_like_news = false;
             $qualified = true;
             $candidates[] = $text;
             continue;
         }

         $this->printDebug('less than 2 bio words');
         if ($looks_like_news == false) {
             // weak candidate: keep it, minus any leading punctuation
             $candidates[] = preg_replace('/^[\\,\\.\\:\\;]\\s*/su', '', $text);
         }
     }

     if (!$qualified) {
         return false;
     }

     $candidates = array_unique($candidates);
     $bio = implode("\n\n", $candidates);

     $accepted = false;
     if (strlen($bio) < 3000 && LsString::withinN($bio, '(' . $name_re . ')', '(is|was|holds|led|has|had|provides|practices|served|leads)', 2)) {
         // the name has to show up within the first 20 words of the bio
         if (preg_match('/^.*?\\b(' . $name_re . ')\\b/is', $bio, $lead) && count(explode(' ', $lead[0])) < 20) {
             $accepted = true;
             $this->printDebug('SUCCESS');
         }
     } else {
         $this->printDebug('within N failed !!!!');
     }
     if (!$accepted) {
         return false;
     }

     // when the org is named after the person, a last-name hit alone is ambiguous
     if (!stristr($org->name, $person->name_last)) {
         return $bio;
     }

     if (strlen($person->name_first) > 1) {
         if (preg_match('/([^\\s]+\\s+){0,14}/is', $candidates[0], $opening)) {
             $first_re = LsString::escapeStringForRegex($person->name_first);
             if (preg_match('/\\b' . $first_re . '\\b/is', $opening[0]) || preg_match('/\\b(Mr|Mrs|Ms)\\b/su', $candidates[0])) {
                 return $bio;
             }
         }
         return false;
     }

     if (preg_match('/\\b(he|she|him|her|his|mr|ms|mrs)\\b/is', $candidates[0])) {
         return $bio;
     }
     return false;
 }
Esempio n. 3
0
 /**
  * Converts an HTML table of donations into tab-separated text.
  *
  * Each table row is split into cells. The first cell is expected to hold
  * "<name><br><street>[<br><city>]" and the next three cells the amount,
  * date and committee. Rows with fewer than two name lines or fewer than six
  * cells are ignored. Names written as "Last, First" are reordered unless
  * they end in a company suffix (inc, llp, llc, p.c., pc).
  *
  * @param string $str HTML containing the table rows
  *
  * @return string header line plus one line per donation, tab-separated;
  *                an empty string when no row could be parsed
  */
 static function parseNyDonations($str)
 {
     preg_match_all('/(<td.*?>(.*?\\s).*?<.td>\\s*)*?<.tr>/is', $str, $matches);
     $results = array();
     foreach ($matches[0] as $match) {
         $arr = preg_split('/<.td>\\s*<td.*?>/is', $match);
         $name_parts = preg_split('/<br>/is', $arr[0]);
         if (count($name_parts) < 2 || count($arr) < 6) {
             continue;
         }
         $result = array("name" => "", "street" => "", "city" => "");
         // no capture array here: the original reused $match and clobbered the loop variable
         if (preg_match('/(inc|llp|llc|p\\.c\\.|pc)\\.?$/is', $name_parts[0])) {
             $result['name'] = $name_parts[0];
         } else {
             $np = preg_split('/\\,\\s*/is', $name_parts[0]);
             if (count($np) > 1) {
                 if (count($np) == 3 && stripos($np[2], "jr") !== 0) {
                     $result['name'] = $np[2] . " " . $np[0] . ", " . $np[1];
                 } else {
                     $result['name'] = $np[1] . " " . $np[0];
                 }
             } else {
                 $result['name'] = $np[0];
             }
         }
         $result['street'] = $name_parts[1];
         if (count($name_parts) > 2) {
             $result['city'] = $name_parts[2];
         }
         $result['amount'] = $arr[1];
         $result['date'] = $arr[2];
         $result['committee'] = $arr[3];
         foreach ($result as &$r) {
             $r = preg_replace('/(\\n|(<.*?>))/is', "", $r);
             $r = trim($r);
             $r = LsString::spacesToSpace($r);
         }
         unset($r);
         $results[] = $result;
     }
     // nothing parsed: there is no first row to take the header from
     if (!count($results)) {
         return '';
     }
     $str = implode("\t", array_keys($results[0])) . "\n";
     foreach ($results as $r) {
         $str .= implode("\t", $r);
         $str .= "\n";
     }
     return trim($str);
 }
Esempio n. 4
0
 /**
  * Imports the <Filing> elements of one lobbying-disclosure XML file.
  *
  * The file at $this->_dir . $lobby_import->filename is loaded whole and
  * parsed with SimpleXMLElement. Filings are processed starting at
  * $lobby_import->offset, each inside its own database transaction. For
  * every filing the registrant, client, filing type, period, lobbyists,
  * government entities and issues are looked up or created and linked to a
  * new LdaFiling.
  *
  * The method terminates the whole script with die once $this->_count
  * exceeds $this->_limit.
  *
  * @param object $lobby_import import record; filename, offset and id are
  *                             read, offset and done are written and saved
  *
  * @throws Exception any exception raised while processing a filing is
  *                   rethrown after the transaction is rolled back
  */
 private function importLdaData($lobby_import)
 {
     $path = $this->_dir . $lobby_import->filename;
     $raw = file_get_contents($path);
     $xml = new SimpleXMLElement($raw);
     $filings = $xml->Filing;
     $limit = count($filings);
     $this->printDebug('importing data from ' . $lobby_import->filename . ' (record ' . $lobby_import->offset . ' of ' . $limit . ')');
     for ($n = (int) $lobby_import->offset; $n < $limit; $n++) {
         // overall cap on processed records; exceeding it ends the script
         $this->_count = $this->_count + 1;
         if ($this->_count > $this->_limit) {
             die;
         }
         try {
             $this->db->beginTransaction();
             // record progress on the import row so a later run can start from here
             $lobby_import->offset = $n;
             if ($n == $limit - 1) {
                 $lobby_import->done = 1;
             }
             $lobby_import->save();
             if (!isset($filings[$n])) {
                 // diagnostic output for an unexpected gap in the filing list
                 echo 'ok';
                 var_dump($filings[$n - 1]);
                 var_dump($filings[$n + 1]);
                 $this->printDebug('not set' . $n);
                 $this->db->commit();
                 continue;
             }
             $filing = $filings[$n];
             // filings without a registrant are skipped
             if (!isset($filing->Registrant)) {
                 $this->db->commit();
                 continue;
             }
             //var_dump($filing);
             $f = new LdaFiling();
             $f->federal_filing_id = $filing['ID'];
             $f->year = $filing['Year'];
             $f->amount = $filing['Amount'];
             $f->received = $filing['Received'];
             $f->import_id = $lobby_import->id;
             $f->offset = $n;
             //check for duplicate
             if (Doctrine::getTable('LdaFiling')->findOneByFederalFilingId($f->federal_filing_id)) {
                 $this->db->commit();
                 continue;
             }
             //set registrant (reuse an existing row with the same federal id)
             if (!($r = Doctrine::getTable('LdaRegistrant')->findOneByFederalRegistrantId($filing->Registrant['RegistrantID']))) {
                 $r = new LdaRegistrant();
                 $r->name = LsString::spacesToSpace($filing->Registrant['RegistrantName']);
                 $r->federal_registrant_id = $filing->Registrant['RegistrantID'];
                 $r->address = $filing->Registrant['Address'];
                 $r->description = LsString::spacesToSpace($filing->Registrant['GeneralDescription']);
                 $r->country = $filing->Registrant['RegistrantCountry'];
                 $r->save();
             }
             $f->registrant_id = $r->id;
             //set client (clients are looked up per registrant)
             if ($filing->Client) {
                 if (!($c = LsQuery::getByModelAndFieldsQuery('LdaClient', array('registrant_id' => $r->id, 'federal_client_id' => $filing->Client['ClientID']))->execute()->getFirst())) {
                     $c = new LdaClient();
                     $c->name = LsString::spacesToSpace($filing->Client['ClientName']);
                     $c->federal_client_id = $filing->Client['ClientID'];
                     $c->registrant_id = $r->id;
                     $c->contact_name = LsString::spacesToSpace($filing->Client['ContactFullname']);
                     $c->description = LsString::spacesToSpace($filing->Client['GeneralDescription']);
                     $c->country = $filing->Client['ClientCountry'];
                     $c->state = $filing->Client['ClientState'];
                     $c->save();
                 }
                 $f->client_id = $c->id;
             }
             //set filing type
             if ($type = (string) $filing['Type']) {
                 //look for existing type
                 if (!($t = Doctrine::getTable('LdaType')->findOneByDescription($type))) {
                     $t = new LdaType();
                     $t->description = $type;
                     $t->save();
                 }
                 $f->type_id = $t->id;
                 unset($t);
             }
             //set filing period
             if ($period = (string) $filing['Period']) {
                 //look for existing period
                 if (!($p = Doctrine::getTable('LdaPeriod')->findOneByDescription($period))) {
                     $p = new LdaPeriod();
                     $p->description = $period;
                     $p->save();
                 }
                 $f->period_id = $p->id;
             }
             $f->save();
             //add lobbyists (looked up per registrant by name)
             if ($filing->Lobbyists) {
                 foreach ($filing->Lobbyists->Lobbyist as $lobbyist) {
                     $name = (string) $lobbyist['LobbyistName'];
                     if (!($l = LsQuery::getByModelAndFieldsQuery('LdaLobbyist', array('registrant_id' => $r->id, 'name' => $name))->execute()->getFirst())) {
                         $l = new LdaLobbyist();
                         $l->name = $name;
                         $l->registrant_id = $r->id;
                         $l->status = $lobbyist['LobbyistStatus'];
                         // NOTE(review): 'LobbyisteIndicator' may be a misspelling of the
                         // XML attribute name -- verify against the source files
                         $l->indicator = $lobbyist['LobbyisteIndicator'];
                         $l->official_position = $lobbyist['OfficialPosition'];
                         $l->save();
                     }
                     $fl = new LdaFilingLobbyist();
                     $fl->filing_id = $f->id;
                     $fl->lobbyist_id = $l->id;
                     $fl->save();
                     unset($fl);
                     unset($l);
                 }
             }
             //add govt entities
             if ($filing->GovernmentEntities) {
                 foreach ($filing->GovernmentEntities->GovernmentEntity as $govt) {
                     $govt = trim($govt['GovEntityName']);
                     if (!($g = Doctrine::getTable('LdaGovt')->findOneByName($govt))) {
                         $g = new LdaGovt();
                         $g->name = $govt;
                         $g->save();
                     }
                     $fg = new LdaFilingGovt();
                     $fg->filing_id = $f->id;
                     $fg->govt_id = $g->id;
                     $fg->save();
                     unset($fg);
                     unset($g);
                 }
             }
             //add issues
             if ($filing->Issues) {
                 foreach ($filing->Issues->Issue as $issue) {
                     $code = (string) $issue['Code'];
                     if (!($i = Doctrine::getTable('LdaIssue')->findOneByName($code))) {
                         $i = new LdaIssue();
                         $i->name = $code;
                         $i->save();
                     }
                     $fi = new LdaFilingIssue();
                     $fi->filing_id = $f->id;
                     $fi->issue_id = $i->id;
                     $fi->specific_issue = $issue['SpecificIssue'];
                     $fi->save();
                     unset($fi);
                     unset($i);
                 }
             }
             $this->printDebug($f->federal_filing_id);
             //check for duplicate again
             // NOTE(review): this lookup runs after $f->save() on the same connection,
             // so it may find the filing that was just saved and roll the whole
             // iteration back (including the offset update) -- confirm intent
             if (Doctrine::getTable('LdaFiling')->findOneByFederalFilingId($f->federal_filing_id)) {
                 $this->db->rollback();
                 continue;
             }
             $this->db->commit();
         } catch (Exception $e) {
             // undo the partial filing, then let the caller see the failure
             $this->db->rollback();
             throw $e;
         }
         // drop per-filing objects before the next iteration
         unset($f);
         unset($r);
         unset($c);
         unset($filing);
     }
     unset($xml);
     unset($raw);
     unset($filings);
 }
Esempio n. 5
0
 /**
  * Interactively turns one scraped match into an entity plus relationships.
  *
  * The operator is prompted (via $this->readline) to confirm the entity, to
  * pick an existing similar entity or create a new one, optionally to mine
  * the bio for organisations the person holds positions at, and finally the
  * entity is attached to $this->list and/or related to $this->org when those
  * are configured.
  *
  * Fixes relative to the earlier version:
  *  - the by-reference foreach variable is unset, so the later preg_match()
  *    on $match['rank'] no longer overwrites the last element of $match;
  *  - $bio_dirty is always defined, even when $match has no 'bio' key;
  *  - the retry counter is reset before the "couldn't find org" prompt, so
  *    that prompt gets its own retries;
  *  - a dead block that tested $match['affiliation1'] but read
  *    $match['affiliation'] into an unused variable was removed.
  *
  * @param array $match scraped fields; 'name' is required, 'bio' and 'rank'
  *                     are optional
  *
  * @return false|null false when the operator declines to process the match,
  *                    null in every other case
  */
 public function parseResults($match)
 {
     // semicolon-separated variant of the bio, kept for parseBio()
     $bio_dirty = null;
     if (isset($match['bio'])) {
         $bio_dirty = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($match['bio'], "; ")));
         $bio_dirty = preg_replace('/(\\;\\s)+/is', '; ', $bio_dirty);
     }
     foreach ($match as &$field) {
         $field = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($field, " ")));
     }
     // break the reference so later writes cannot leak into $match
     unset($field);
     if (!isset($match['name'])) {
         return;
     }
     $name = $match['name'];
     $bio = '';
     if (isset($match['bio'])) {
         $bio = $match['bio'];
     }
     $this->printDebug("_________________________\n\nname: " . $name . "\n");
     $this->printDebug("bio: " . $bio . "\n");
     $accept = strtolower($this->readline('Process this entity? (n to skip) '));
     if ($accept == 'n' || $accept == 'no') {
         return false;
     }
     if (!$this->org_org) {
         if ($this->last_first) {
             $entity = PersonTable::parseCommaName($name);
         } else {
             $entity = PersonTable::parseFlatName($name);
         }
         $similar_entities = PersonTable::getSimilarQuery2($entity)->execute();
     } else {
         $entity = new Entity();
         $entity->addExtension('Org');
         foreach ($this->org_extensions as $ext) {
             $entity->addExtension($ext);
         }
         $entity->setEntityField('name', $name);
         $name = trim($name);
         $name = str_replace('.', '', $name);
         $similar_entities = OrgTable::getSimilarQuery($entity)->execute();
     }
     // let the operator pick an existing entity, if any looks right
     $matched = false;
     foreach ($similar_entities as $similar_entity) {
         if ($similar_entity['primary_ext'] == 'Person') {
             $this->printDebug('  POSSIBLE MATCH: ' . $similar_entity->name . ' (Orgs :: ' . $similar_entity->getRelatedOrgsSummary() . "  Bio :: {$similar_entity->summary})");
         } else {
             $this->printDebug('  POSSIBLE MATCH: ' . $similar_entity->name . ' (Summary :: ' . $similar_entity->summary . ')');
         }
         $accept = $this->readline('  Is this the same entity? (y or n)');
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline('  Is this the same entity? (y or n) ');
             $attempts++;
         }
         if ($accept == 'y') {
             $entity = $similar_entity;
             $matched = true;
             $this->printDebug('             [accepted]');
             break;
         } else {
             if ($accept == 'break') {
                 break;
             }
         }
     }
     // otherwise offer to create a new entity
     $created = false;
     if (!$matched) {
         if ($entity->getPrimaryExtension() == 'Person') {
             $this->printDebug('  New person: ' . $entity->name_first . ' ' . $entity->name_last);
         } else {
             $this->printDebug('  New org: ' . $entity->name);
         }
         $accept = $this->readline('    create this new entity? (y or n) ');
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline('    create this new entity? (y or n) ');
             $attempts++;
         }
         if ($accept == 'y') {
             if ($entity->getPrimaryExtension() == 'Person') {
                 $this->printDebug("\n  Bio: {$bio} \n");
                 $accept = $this->readline('    Add this bio? (y or n) ');
                 $attempts = 1;
                 while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                     $accept = $this->readline('    add this bio? (y or n) ');
                     $attempts++;
                 }
                 if ($accept == 'y') {
                     $entity->summary = $bio;
                 }
             }
             $entity->save();
             $entity->addReference($this->url, null, null, $this->url_name);
             $created = true;
             $this->printDebug(' ' . $entity->name . ' saved');
         }
     }
     // for people, optionally mine the bio for position relationships
     if (($matched || $created) && $entity->getPrimaryExtension() == 'Person') {
         $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
             $attempts++;
         }
         if ($accept == 'y') {
             $names = $entity->parseBio($bio_dirty);
             $this->printDebug(" Orgs that {$entity} has a position at?");
             foreach ($names as $name) {
                 $exists = false;
                 $name = trim($name);
                 $accept = $this->readline(" > {$name} ::  an org? (y or n or b to break) ");
                 $attempts = 1;
                 $accept = strtolower($accept);
                 while ($accept != 'y' && $accept != 'n' && $accept != 'b' && $attempts < 5) {
                     $accept = $this->readline("  {$name} ::  an org? (y or n or b to break) ");
                     $accept = strtolower($accept);
                     $attempts++;
                 }
                 if ($accept == 'b') {
                     break;
                 } else {
                     if ($accept == 'y') {
                         $this->printDebug(' .....looking for names.....');
                         $orgs = EntityTable::getByExtensionAndNameQuery('Org', $name)->limit(10)->execute();
                         $related_org = null;
                         foreach ($orgs as $org) {
                             $q = LsDoctrineQuery::create()->from('Relationship r')->where('entity1_id = ? and entity2_id = ?', array($entity->id, $org->id))->fetchOne();
                             if ($q) {
                                 $this->printDebug('  Position already exists, skipping...');
                                 $exists = true;
                                 break;
                             }
                             $accept = $this->readline("    Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                             $attempts = 1;
                             while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                                 $accept = $this->readline("    Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                                 $attempts++;
                             }
                             if ($accept == 'y') {
                                 $related_org = $org;
                                 break;
                             }
                         }
                         if (!$related_org && !$exists) {
                             $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                             // fresh retry budget; previously inherited from an earlier prompt
                             $attempts = 1;
                             while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                                 $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                                 $attempts++;
                             }
                             if ($accept == 'y') {
                                 $related_org = new Entity();
                                 $related_org->addExtension('Org');
                                 $related_org->name = preg_replace('/\\.(?!com)/i', '', $name);
                                 $extensions = $this->readline("  what extensions should this org get? (eg 'Business, LobbyingFirm, LawFirm') ");
                                 $extensions = preg_split('/\\,\\s*/isu', $extensions, -1, PREG_SPLIT_NO_EMPTY);
                                 try {
                                     foreach ($extensions as $extension) {
                                         $related_org->addExtension($extension);
                                     }
                                     $related_org->save();
                                     $related_org->addReference($this->url, null, null, $this->url_name);
                                 } catch (Exception $e) {
                                     $this->printDebug('   !!! problems with org creation, skipping');
                                     $related_org = null;
                                 }
                             }
                         }
                         if ($related_org) {
                             $q = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $related_org->id, 1))->fetchOne();
                             if ($q) {
                                 $this->printDebug('   (relationship already found, skipping...)');
                                 continue;
                             }
                             $relationship = new Relationship();
                             $relationship->Entity1 = $entity;
                             $relationship->Entity2 = $related_org;
                             $relationship->setCategory('Position');
                             $title = $this->readline("     Title for this position relationship? (<enter> to skip) ");
                             if (strlen($title) > 2) {
                                 $relationship->description1 = $title;
                             }
                             $current = strtolower($this->readline("      Is the relationship current? (y or n or <enter> to skip) "));
                             if (in_array($current, array('y', 'yes'))) {
                                 $relationship->is_current = 1;
                             } else {
                                 if (in_array($current, array('n', 'no'))) {
                                     $relationship->is_current = 0;
                                 }
                             }
                             $board = strtolower($this->readline("      Is the relationship a board position? (y or n or <enter> to skip) "));
                             if (in_array($board, array('y', 'yes'))) {
                                 $relationship->is_board = 1;
                             } else {
                                 if (in_array($board, array('n', 'no'))) {
                                     $relationship->is_board = 0;
                                 }
                             }
                             $relationship->save();
                             $relationship->addReference($this->url, null, null, $this->url_name);
                             $this->printDebug("     Relationship saved: {$relationship}");
                         }
                     }
                 }
             }
         }
     }
     if ($matched || $created) {
         // list membership, unless it already exists
         if ($this->list) {
             $q = LsDoctrineQuery::create()->from('LsListEntity l')->where('l.entity_id = ? and l.list_id = ?', array($entity->id, $this->list->id))->fetchOne();
             if (!$q) {
                 $le = new LsListEntity();
                 $le->Entity = $entity;
                 $le->LsList = $this->list;
                 if (isset($match['rank'])) {
                     if (preg_match('/(\\d+)/isu', $match['rank'], $rank_match)) {
                         $le->rank = $rank_match[1];
                     }
                 }
                 $le->save();
                 $this->printDebug('List membership saved');
             }
         }
         // relationship to the configured org, unless one is already stored
         if ($this->org) {
             $q = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $this->org->id, 1))->fetchOne();
             if ($q) {
                 $this->printDebug('   (relationship already found, skipping...)');
                 return;
             }
             $relationship = new Relationship();
             $relationship->Entity1 = $entity;
             $relationship->Entity2 = $this->org;
             $relationship->setCategory($this->relationship_category);
             if ($this->description1) {
                 $relationship->description1 = $this->description1;
             } else {
                 $description = $this->readline("       what description to give this relationship ({$relationship}) ? (less than 3 chars will skip)");
                 if (strlen($description) > 2) {
                     $relationship->description1 = $description;
                 }
             }
             if ($this->relationship_category == 'Position') {
                 $relationship->is_board = $this->is_board;
             } else {
                 if ($this->relationship_category == 'Donation') {
                     if ($this->amount) {
                         $relationship->amount = $this->amount;
                     } else {
                         $amount = $this->readline("  what amount ({$relationship}) ? (less than 3 chars will skip)");
                         if (strlen($amount) > 1) {
                             $relationship->amount = $amount;
                         }
                     }
                 }
             }
             $relationship->save();
             $relationship->addReference($this->url, null, null, $this->url_name);
             $this->printDebug(" Relationship saved: {$relationship}");
         }
     }
 }
Esempio n. 6
0
 /**
  * Returns the first entry of $this->_paragraphs after it has been passed
  * through LsHtml::stripTags(), LsHtml::replaceEntities() and
  * LsString::spacesToSpace().
  *
  * The guard used to be inverted: it returned null whenever paragraphs were
  * present and indexed into an empty list otherwise.
  *
  * @return string|null the cleaned paragraph, or null when there are no paragraphs
  */
 public function getCleanFirstParagraph()
 {
     if (empty($this->_paragraphs)) {
         return null;
     }
     $first = $this->_paragraphs[0];
     return LsString::spacesToSpace(LsHtml::replaceEntities(LsHtml::stripTags($first)));
 }
 /**
  * Looks up each roster member's name in the proxy document at $url.
  *
  * The document is fetched with $this->browser, entity-decoded and
  * unaccented. For every roster row that has an officer title or is flagged
  * as a director, occurrences of the first word of personName are located
  * with stripos() and the surrounding text is matched against a loose name
  * regex. On the first compatible hit the row gains proxyUrl, proxyYear,
  * nameLast and proxyName.
  *
  * Fixes relative to the earlier version:
  *  - the text window around a hit is clamped to the start of the document;
  *    a hit within the first 70 bytes used to produce a negative substr()
  *    offset, which reads from the END of the document instead;
  *  - rows whose personName contains no name parts are skipped instead of
  *    triggering an undefined offset and an empty-needle search.
  *
  * @param array  $roster     rows with personName, officerTitle and isDirector
  * @param string $url        location of the proxy document
  * @param mixed  $proxy_year stored verbatim as proxyYear on matched rows
  *
  * @return array the roster, with matched rows augmented; returned unchanged
  *               when the request fails (the failure is appended to $this->logFile)
  */
 private function getProxyData($roster, $url, $proxy_year)
 {
     echo "fetching data from proxy at {$url} \n\n";
     if (!$this->browser->get($url)->responseIsError()) {
         $this->proxyText = $this->browser->getResponseText();
         $this->proxyText = LsHtml::replaceEntities($this->proxyText, ENT_QUOTES, 'UTF-8');
         $this->proxyText = LsString::utf8TransUnaccent($this->proxyText);
         foreach ($roster as &$r) {
             //make sure this is not form 4 data for a corporation, continue to the next if it is
             if ($r['officerTitle'] == '' && $r['isDirector'] != 1 && strtoupper($r['isDirector']) != strtoupper('true')) {
                 continue;
             }
             $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
             //nothing to search for without at least one name part
             if (empty($parts)) {
                 continue;
             }
             //first word, but has to be part of last name because form4 names are in format RUBIN ROBERT E
             $last = trim($parts[0]);
             //sometimes O'LEARY can appear as O LEARY in the form 4
             if (strlen($last) == 1) {
                 $r['personName'] = $last . substr($r['personName'], 2);
                 $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
                 $last = trim($parts[0]);
             }
             //prepare regex to match occurrences of full name
             //case insensitive to accommodate for various irregularities in names
             $re = LsLanguage::buildLooseNameRegex($r['personName']);
             $offset = 0;
             $found = true;
             //use stripos (much faster than regex) to find occurrences of the first word in the form 4 name (assumed to be part of the last name)
             //needs to be case insensitive
             //continue searching for last name in proxy until a matching full name (proxyName) is found
             while (!isset($r['proxyName']) && $found !== false) {
                 $found = stripos($this->proxyText, $last, $offset);
                 $offset = $found + 1;
                 if ($found !== false) {
                     //window of up to 70 bytes before and 50 bytes after the hit,
                     //clamped so it never wraps around to the end of the text
                     $start = max(0, $found - 70);
                     $str = substr($this->proxyText, $start, $found + 50 - $start);
                     preg_match_all($re, $str, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);
                     foreach ($matches as $match) {
                         if (stristr($match[1][0], '=')) {
                             continue;
                         }
                         //since we may or may not be working with the full last name, use getLastName to return full last name
                         $new_last = $this->getLastName($r['personName'], $match[1][0]);
                         if ($new_last) {
                             //if last name produced by case insensitive search has no capital letters, not a match
                             if (preg_match('/\\p{Lu}/su', $new_last) == 0) {
                                 continue;
                             }
                             //now that we have a last name, pull the full name from the string
                             $name = LsLanguage::getNameWithLast($match[0][0], $new_last);
                             if ($name) {
                                 $parts = preg_split('/\\s+/isu', $name['nameStart'], -1, PREG_SPLIT_NO_EMPTY);
                                 $non_prefixes = array_diff($parts, PersonTable::$nameParsePrefixes);
                                 //if all we've found are matching prefixes, not a match
                                 if (count($non_prefixes) == 0) {
                                     continue;
                                 } else {
                                     $name1_parts = preg_split('/\\s+/', $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
                                     $ct = 0;
                                     //compatibility check to correct for vagueness of regex
                                     foreach ($non_prefixes as $n) {
                                         foreach ($name1_parts as $p) {
                                             if (stripos($n, $p) === 0 || stripos($p, $n) === 0) {
                                                 $ct++;
                                             }
                                         }
                                     }
                                     //if name is (somewhat) compatible, assume we've found it
                                     if ($ct > 0) {
                                         $r['proxyUrl'] = $url;
                                         $r['proxyYear'] = $proxy_year;
                                         $r['nameLast'] = trim(LsString::spacesToSpace($name['nameLast']));
                                         $r['proxyName'] = trim(LsString::spacesToSpace($name['nameFull']));
                                     }
                                 }
                             }
                         }
                     }
                 }
             }
         }
         unset($r);
     } else {
         //Error response (eg. 404, 500, etc)
         $log = fopen($this->logFile, 'a');
         fwrite($log, "Couldn't get " . $url . "\n");
         fclose($log);
     }
     return $roster;
 }