Ejemplo n.º 1
0
 /**
  * Action: fetch a NY State Board of Elections results page and parse it.
  *
  * Always exposes a fresh ReferenceForm as $this->reference_form. On POST the
  * form is bound to the "reference" request parameter; if its "source" URL
  * points at the elections plsql_browser endpoint, the page is downloaded and
  * the output of LsHtml::parseNyDonations() is stored in $this->parsed_text.
  *
  * @param object $request Request object; isMethod() and getParameter() are used.
  */
 public function executeParseNyDonations($request)
 {
     $this->reference_form = new ReferenceForm();
     if (!$request->isMethod('post')) {
         return;
     }
     $refParams = $request->getParameter('reference');
     $this->reference_form->bind($refParams);
     $url = isset($refParams['source']) ? $refParams['source'] : null;
     // The URL comes from the request and is fetched server-side, so the
     // allow-list must pin scheme, host and port exactly. The previous pattern
     // used bare "." wildcards, which also accepted look-alike hosts such as
     // "www.elections.state.ny.us.8080.plsql_browser.example.com".
     if (!is_string($url) || !preg_match('#^http://www\\.elections\\.state\\.ny\\.us:8080/plsql_browser#i', $url)) {
         return;
     }
     $browser = new sfWebBrowser();
     if (!$browser->get($url)->responseIsError()) {
         $this->parsed_text = LsHtml::parseNyDonations($browser->getResponseText());
     }
 }
Ejemplo n.º 2
0
 /**
  * Tries to extract a biography for $person from the HTML in $page.
  *
  * Text runs that contain the person's last name (and first name, when both
  * names appear in the page <title>) and end at a layout tag are collected as
  * candidates. Each candidate is cleaned and passed through a series of
  * heuristics (length, name match, word count, "news" markers, quotes,
  * lowercase word count, bio vocabulary, trailing period). Accepted candidates
  * are de-duplicated and joined with blank lines.
  *
  * @param string $page   HTML to search.
  * @param object $person Person; name_first, name_last and getExtensions() are read.
  * @param object $org    Organisation; only ->name is read.
  *
  * @return string|false The assembled bio, or false when nothing qualifies.
  */
 private function findPersonBio($page, $person, $org)
 {
     $name_re = LsString::escapeStringForRegex($person->name_last);
     if (preg_match('/<title>([^<]*)<\\/title>/is', $page, $match)) {
         // Only match on the first name too when the page title names this person.
         if (stristr($match[1], $person->name_last) && stristr($match[1], $person->name_first) && strlen($person->name_first) > 2) {
             $name_re .= '|' . LsString::escapeStringForRegex($person->name_first);
         }
     }
     $layout_tags = implode('|', LsHtml::$layoutTags);
     $re2 = '/>([^<]*?(' . $name_re . ')(\\s|,|<)(.*?))<\\/?(' . $layout_tags . ')/is';
     $re = $re2 . 'u';
     // Try the UTF-8 aware pattern first; fall back to the byte-wise one (the
     // /u variant yields no match when $page is not valid UTF-8).
     if (!preg_match_all($re, $page, $matches) && !preg_match_all($re2, $page, $matches)) {
         $this->printDebug('no matches found');
         return false;
     }
     $arr = array();
     $qual = false;
     // $news is deliberately carried across candidates: once a news-like
     // candidate is seen, weak candidates (fewer than 2 bio words) are rejected
     // until a strong candidate resets it.
     $news = false;
     $bio_words_re = null;
     foreach ($matches[1] as $match) {
         // Skip fragments that look like script/CSS rather than prose.
         if (stristr($match, '}') || stristr($match, '{') || preg_match('/\\svar\\s/is', $match)) {
             continue;
         }
         $str = LsHtml::replaceEntities($match);
         $str = LsHtml::stripTags($str, '');
         $str = trim(LsString::spacesToSpace($str));
         $this->printDebug(strlen($str));
         if (strlen($str) > 3000) {
             $this->printDebug('FAILED - str too long');
             continue;
         }
         if (preg_match('/(^|\\b)(' . $name_re . ')\\b/is', $str) == 0) {
             $this->printDebug($match . 'FAILED - no name match');
             continue;
         }
         if (count(explode(' ', $str)) < 12) {
             $this->printDebug('FAILED - str not long enough');
             continue;
         }
         if (stristr($str, 'announce') || stristr($str, 'today') || stristr($str, '—') || stristr($str, '–') || preg_match('/^[^\\-]{0,100}\\-(\\-|\\s)/is', $str)) {
             $news = true;
             $this->printDebug('FAILED: dash / announced / today');
             continue;
         }
         // A quoted passage longer than 6 words is treated as news copy. The
         // closing quote must repeat the opening quote (group 2). The previous
         // pattern referenced group 1 (the leading whitespace), which flagged
         // practically any text containing an opening quote.
         if (preg_match('/(^|\\s)([\'"”])((?:(?!\\2).)+)\\2/is', $str, $qm) && count(explode(' ', $qm[0])) > 6) {
             $news = true;
             $this->printDebug('FAILED: quote');
             continue;
         }
         if (preg_match_all('/\\s(\\p{Ll})+\\b/su', $str, $lcm) < 5) {
             $this->printDebug('FAILED: not enough lowercase');
             continue;
         }
         if ($bio_words_re === null) {
             // Built lazily, once: the word list does not change per candidate.
             $bio_words = PersonTable::$commonBioWords;
             if (in_array('Lobbyist', $person->getExtensions())) {
                 $bio_words = array_merge($bio_words, LobbyistTable::$commonBioWords);
             }
             $bio_words_re = implode('|', $bio_words);
         }
         // Separate match array so the outer $matches is not clobbered.
         $bio_word_ct = preg_match_all('/\\s(' . $bio_words_re . ')\\s/is', $str, $bio_word_matches);
         if (preg_match('/\\.$/is', $str) == 0) {
             $this->printDebug('no period at end of string');
             continue;
         }
         if ($bio_word_ct > 1) {
             $news = false;
             $qual = true;
             $arr[] = $str;
             continue;
         }
         $this->printDebug('less than 2 bio words');
         if ($news == false) {
             $cleaned = preg_replace('/^[\\,\\.\\:\\;]\\s*/su', '', $str);
             // preg_replace() returns null on invalid UTF-8 with /u; keep the
             // uncleaned text rather than pushing null into the result.
             $arr[] = $cleaned === null ? $str : $cleaned;
         }
     }
     // At least one strong candidate (2+ bio words) is required.
     if (!$qual) {
         return false;
     }
     $arr = array_unique($arr);
     $bio = implode("\n\n", $arr);
     $ret = false;
     // NOTE(review): withinN() presumably checks that the name and one of the
     // verbs occur within 2 words of each other - confirm against LsString.
     if (strlen($bio) < 3000 && LsString::withinN($bio, '(' . $name_re . ')', '(is|was|holds|led|has|had|provides|practices|served|leads)', 2)) {
         // The name must show up within the first 20 words of the bio.
         if (preg_match('/^.*?\\b(' . $name_re . ')\\b/is', $bio, $m) && count(explode(' ', $m[0])) < 20) {
             $ret = true;
             $this->printDebug('SUCCESS');
         }
     } else {
         $this->printDebug('within N failed !!!!');
     }
     if (!$ret) {
         return false;
     }
     // When the org's name contains the person's last name, a last-name hit may
     // refer to the org, so additionally require a first name, title or
     // pronoun in the first candidate.
     $org_test = true;
     if (stristr($org->name, $person->name_last)) {
         $org_test = false;
         if (strlen($person->name_first) > 1) {
             if (preg_match('/([^\\s]+\\s+){0,14}/is', $arr[0], $beg_match)) {
                 $nf_re = LsString::escapeStringForRegex($person->name_first);
                 if (preg_match('/\\b' . $nf_re . '\\b/is', $beg_match[0]) || preg_match('/\\b(Mr|Mrs|Ms)\\b/su', $arr[0])) {
                     $org_test = true;
                 }
             }
         } else {
             if (preg_match('/\\b(he|she|him|her|his|mr|ms|mrs)\\b/is', $arr[0])) {
                 $org_test = true;
             }
         }
     }
     return $org_test ? $bio : false;
 }
Ejemplo n.º 3
0
 /**
  * Imports one governor from a profile page on the source site.
  *
  * Fetches $this->_baseUrl . $row['url'] and, if the request succeeds, creates
  * or reuses a Person entity for $row['name'], then fills in birth date,
  * birthplace, party, summary, schools, the governor's office and position(s),
  * spouse, phone number and image from labelled fields in the page HTML.
  * Nothing happens when the HTTP response is an error.
  *
  * @param array $row Reads the keys 'url', 'name', 'state' and 'years'
  *                   ('years' is sorted and consumed as start/end pairs).
  */
 protected function importGovernor($row)
 {
     $url = $this->_baseUrl . $row['url'];
     if (!$this->browser->get($url)->responseIsError()) {
         $text = $this->browser->getResponseText();
         $text = LsHtml::replaceEntities($text);
         //preg_match('/>Family\:<\/b>([^<]*)<br/is',$text,$family_arr);
         // Drop the "Gov." title before parsing the name.
         $name = trim(str_ireplace('Gov.', '', $row['name']));
         $this->printDebug('');
         $this->printDebug($name . ':');
         $governor = PersonTable::parseFlatName($name);
         $governor->addExtension('PoliticalCandidate');
         $governor->addExtension('ElectedRepresentative');
         $governor->is_state = 1;
         // Look for an existing entity to reuse: one first name must be a prefix
         // of the other, and the existing bio must mention "governor(ship)".
         $similar = $governor->getSimilarEntitiesQuery(true)->execute();
         foreach ($similar as $s) {
             $sim_re = LsString::escapeStringForRegex($s->name_first);
             $search_re = LsString::escapeStringForRegex($governor->name_first);
             if (preg_match('/^' . $sim_re . '/su', $governor->name_first) == 0 && preg_match('/^' . $search_re . '/su', $s->name_first) == 0) {
                 continue;
             }
             $bio = $s->getExtendedBio();
             if (preg_match('/\\bgovernor(ship)?\\b/isu', $bio)) {
                 $governor = $s;
                 $this->printDebug(' Found existing governor: ' . $s->name . ' ' . $s->id);
                 break;
             }
         }
         // Saved here so $governor->id is available for the lookups below.
         $governor->save();
         $this->printDebug($governor->id);
         // Existing values are never overwritten: each field is only filled when empty.
         if (!$governor->start_date && preg_match('/>Born\\:<\\/b>([^<]*)<br/is', $text, $birth_arr)) {
             $this->printDebug(' Birthdate: ' . $birth_arr[1]);
             $governor->start_date = trim($birth_arr[1]);
         }
         if (!$governor->birthplace && preg_match('/>Birth State\\:<\\/b>([^<]*)<br/is', $text, $birth_state_arr)) {
             $this->printDebug(' Birthplace: ' . trim($birth_state_arr[1]));
             $governor->birthplace = trim($birth_state_arr[1]);
         }
         //PARTY MEMBERSHIP
         if (preg_match('/>Party\\:<\\/b>([^<]*)<br/is', $text, $party_arr)) {
             $party_str = $party_arr[1];
             $this->printDebug(' Party: ' . $party_str);
             // $party is only assigned when one of the two checks below matches;
             // if both match, the Republican lookup wins because it runs last.
             if (stristr($party_str, 'Democrat')) {
                 $party = EntityTable::getByExtensionQuery('PoliticalParty')->addWhere('name = ?', 'Democratic Party')->fetchOne();
             }
             if (stristr($party_str, 'Republican')) {
                 $party = EntityTable::getByExtensionQuery('PoliticalParty')->addWhere('name = ?', 'Republican Party')->fetchOne();
             }
             if (isset($party) && $party && !$governor->party_id) {
                 $governor->Party = $party;
                 $governor->is_independent = false;
                 $this->printDebug(' Added membership in ' . $party);
             } else {
                 if (stristr($party_str, 'Independent')) {
                     $governor->is_independent = true;
                 }
             }
         }
         // Summary: concatenate every text run of at least 240 characters that
         // does not mention "Javascript".
         if (!$governor->summary && preg_match_all('/>([^<]{240,})/isu', $text, $bio_match)) {
             $str = '';
             foreach ($bio_match[1] as $b) {
                 if (!stristr($b, 'Javascript')) {
                     $str .= "\n\n" . $b;
                 }
             }
             $str = trim($str);
             if (strlen($str)) {
                 $governor->summary = $str;
             }
         }
         $governor->save();
         // NOTE(review): getAllModifiedFields() is called after save() here and
         // for every other addReference() in this method - confirm it still
         // reports the fields changed by that save.
         $governor->addReference($url, null, $governor->getAllModifiedFields(), 'Governors Association');
         //SCHOOLS
         if (preg_match('/>School\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $school_arr)) {
             // Schools are split on ';', or on ',' when no ';' separator is present.
             $school_names = explode(';', trim($school_arr[1]));
             if (count($school_names) == 1) {
                 $school_names = explode(',', $school_names[0]);
             }
             foreach ($school_names as $school_name) {
                 $school_name = trim($school_name);
                 // Match on the entity name or any alias; create the school if none exists.
                 if (!($school = EntityTable::getByExtensionQuery('School')->leftJoin('e.Alias a')->addWhere('e.name = ? or a.name = ?', array($school_name, $school_name))->fetchOne())) {
                     $school = new Entity();
                     $school->addExtension('Org');
                     $school->addExtension('School');
                     $school->name = $school_name;
                     $school->save();
                     $this->printDebug(' Added School: ' . $school_name);
                 }
                 // Only add the education relationship if it is not already recorded.
                 $q = RelationshipTable::getByCategoryQuery('Education')->addWhere('entity1_id = ? and entity2_id = ?', array($governor->id, $school->id))->fetchOne();
                 if (!$q) {
                     $relationship = new Relationship();
                     $relationship->setCategory('Education');
                     $relationship->Entity1 = $governor;
                     $relationship->Entity2 = $school;
                     $relationship->is_current = 0;
                     $relationship->save();
                     $relationship->addReference($url, null, $relationship->getAllModifiedFields(), 'Governors Association');
                     $this->printDebug(' Added education: ' . $relationship->name);
                 }
             }
         }
         //GOVERNOR OFFICE AND POSITION
         $office_name = 'Office of the Governor of ' . $row['state'];
         if (!($office = EntityTable::getByExtensionQuery('GovernmentBody')->addWhere('name = ?', $office_name)->fetchOne())) {
             $office = new Entity();
             $office->name = $office_name;
             $office->addExtension('Org');
             $office->addExtension('GovernmentBody');
             $state = Doctrine::getTable('AddressState')->findOneByName($row['state']);
             if ($state) {
                 $office->state_id = $state->id;
             }
             $office->save();
             $office->addReference($url, null, $office->getAllModifiedFields(), 'Governors Association');
             $this->printDebug(' Added office: ' . $office->name);
         }
         // Positions are only created when no "Governor" position exists yet
         // between this person and the office.
         $q = RelationshipTable::getByCategoryQuery('Position')->addWhere('entity1_id = ? and entity2_id = ? and description1 = ?', array($governor->id, $office->id, 'Governor'))->fetchOne();
         if (!$q) {
             sort($row['years']);
             // The sorted years are consumed two at a time: (start, end). A
             // trailing unpaired year starts a term that is marked as current.
             $i = 0;
             while ($i < count($row['years'])) {
                 $governorship = new Relationship();
                 $governorship->setCategory('Position');
                 $governorship->Entity1 = $governor;
                 $governorship->Entity2 = $office;
                 $governorship->description1 = 'Governor';
                 $governorship->start_date = $row['years'][$i];
                 $i++;
                 if (isset($row['years'][$i])) {
                     $governorship->end_date = $row['years'][$i];
                     $governorship->is_current = 0;
                     // Only the last completed term sets the "Former Governor" blurb.
                     if (!$governor->blurb && !isset($row['years'][$i + 1])) {
                         $governor->blurb = 'Former Governor of ' . $row['state'];
                     }
                 } else {
                     $governorship->is_current = 1;
                     if (!$governor->blurb) {
                         $governor->blurb = 'Governor of ' . $row['state'];
                     }
                 }
                 $governor->save();
                 $i++;
                 $governorship->save();
                 $governorship->addReference($url, null, $governorship->getAllModifiedFields(), 'Governors Association');
                 $this->printDebug(' Added governorship: ' . $governorship->name);
             }
         }
         //SPOUSE
         if (preg_match('/>Spouse\\:<\\/b>(.*?)<br/is', $text, $spouse_arr)) {
             $spouse = trim(LsHtml::stripTags($spouse_arr[1]));
             // Any existing Family relationship involving the governor (on either
             // side) suppresses spouse creation.
             $q = RelationshipTable::getByCategoryQuery('Family')->addWhere('entity1_id = ? or entity2_id = ?', array($governor->id, $governor->id))->fetchOne();
             if (!$q && strlen($spouse)) {
                 // $spouse changes from the scraped name string to a Person entity here.
                 $spouse = PersonTable::parseFlatName($spouse);
                 $spouse->save();
                 $this->printDebug(' Added spouse: ' . $spouse->name);
                 $relationship = new Relationship();
                 $relationship->setCategory('Family');
                 $relationship->Entity1 = $spouse;
                 $relationship->Entity2 = $governor;
                 $relationship->description1 = 'Spouse';
                 $relationship->description2 = 'Spouse';
                 $relationship->save();
                 $relationship->addReference($url, null, $relationship->getAllModifiedFields(), 'Governors Association');
                 $this->printDebug(' Added spouse relationship: ' . $relationship->name);
             }
         }
         //ADDRESS --not working, malformed addresses
         /*
               if (preg_match('/>Address\:\s*<\/b>(.*?)<b>/is',$text,$address_arr))      
               {
                 $address = trim(str_replace('<br/>',', ',$address_arr[1]));
                 $this->printDebug($address);
                 if ($governor->Address->count() == 0 && $a = $governor->addAddress($address))
                 {
                   $this->printDebug(' Address: ' . $a);
                   $governor->save();
                 }
               }*/
         //PHONE NUMBER
         if (preg_match('/>Phone\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $phone_arr)) {
             $phone_number = trim($phone_arr[1]);
             if (!$governor->Phone->count()) {
                 // NOTE(review): no save() follows addPhone() in this method -
                 // confirm addPhone() persists the phone itself.
                 $phone = $governor->addPhone($phone_number);
                 $this->printDebug(' Phone: ' . $phone);
             }
         }
         //IMAGE
         if (!$governor->Image->count() && preg_match('/<img .*?class\\="display" src\\="([^"]*)"/is', $text, $img_arr)) {
             // From here on $url holds the image URL, not the profile page URL.
             $url = $img_arr[1];
             try {
                 $fileName = ImageTable::createFiles($url, $governor->name_first);
             } catch (Exception $e) {
                 // Image import is best-effort: a failed download just skips the image.
                 $fileName = null;
             }
             if ($fileName) {
                 //insert image record
                 $image = new Image();
                 $image->filename = $fileName;
                 $image->entity_id = $governor->id;
                 $image->title = $governor->name;
                 $image->caption = 'From Governors Association website';
                 $image->is_featured = true;
                 $image->is_free = false;
                 $image->url = $url;
                 $image->save();
                 $this->printDebug("Imported image: " . $image->filename);
             }
         }
     }
 }
 /**
  * Builds a donor from a "<BR>"-delimited text fragment.
  *
  * The fragment is split on the literal "<BR>": segment 0 is passed (tags
  * stripped) to generatePerson(), segment 1 is parsed by
  * LsLanguage::parseCityStatePostal() into an Address that is attached to the
  * donor, and segment 2 (tags stripped) becomes the donor's summary.
  * Missing segments or address parts yield null/empty values instead of
  * undefined-index notices.
  *
  * @param string $text Donor fragment.
  *
  * @return mixed The value returned by generatePerson(), with address and summary set.
  */
 private function generateDonor($text)
 {
     $text_arr = explode("<BR>", $text);
     $donor = $this->generatePerson(LsHtml::stripTags($text_arr[0], ''));
     $address_arr = LsLanguage::parseCityStatePostal(isset($text_arr[1]) ? $text_arr[1] : null);
     $a = new Address();
     // Every address part is optional; guard all of them the same way.
     $a->street1 = isset($address_arr['street1']) ? $address_arr['street1'] : null;
     $a->street2 = isset($address_arr['street2']) ? $address_arr['street2'] : null;
     $a->city = isset($address_arr['city']) ? $address_arr['city'] : null;
     $state_text = isset($address_arr['state']) ? $address_arr['state'] : null;
     if ($state = AddressStateTable::retrieveByText($state_text)) {
         $a->State = $state;
     }
     $a->postal = isset($address_arr['zip']) ? $address_arr['zip'] : null;
     $donor->addAddress($a);
     $donor->summary = isset($text_arr[2]) ? strip_tags(trim($text_arr[2])) : '';
     return $donor;
 }
Ejemplo n.º 5
0
 /**
  * Finds passages in $str that start with the entity's name (or nickname)
  * and run up to the next layout tag.
  *
  * Each hit is logged through printDebug() with tags stripped and whitespace
  * collapsed.
  *
  * NOTE(review): the cleaned text is only logged; the returned array holds the
  * raw captured fragments - confirm that is what callers expect.
  *
  * @param string $str HTML to search; entities are replaced before matching.
  * @param Entity $e   Entity whose getNameRegex() and name_nick are used.
  *
  * @return array|null Raw captured fragments, or null when nothing matched.
  */
 public function getSummary($str, Entity $e)
 {
     $html = LsHtml::replaceEntities($str);
     $alternatives = array($e->getNameRegex());
     if ($e->name_nick && $e->name_nick != '') {
         $alternatives[] = LsString::escapeStringForRegex($e->name_nick);
     }
     $pattern = '/((' . implode('|', $alternatives) . ')(.*?))<\\/?(' . implode('|', LsHtml::$layoutTags) . ')/isu';
     $this->printDebug($pattern);
     if (!preg_match_all($pattern, $html, $found)) {
         return null;
     }
     foreach ($found[1] as $fragment) {
         $this->printDebug(LsString::spacesToSpace(LsHtml::stripTags($fragment)));
     }
     return $found[1];
 }
Ejemplo n.º 6
0
 /**
  * Looks up a person's Businessweek profile and imports what it finds.
  *
  * Steps:
  *  1. Search Google for "site:investing.businessweek.com <name>" (an earlier
  *     Yahoo-based search was replaced by this scraper).
  *  2. Take the first result that is itself a person.asp?personId=N URL;
  *     otherwise open each result and look for a link whose text matches the
  *     person's name and whose URL is such a profile URL.
  *  3. From the profile page read the EDUCATION, OTHER AFFILIATIONS and
  *     BACKGROUND sections. Education is required; the other two are optional.
  *  4. Cite Wikipedia as the source of an education entry when
  *     LsLanguage::getCommonPronouns() finds a match between the institution
  *     and the Wikipedia text, else cite the Businessweek profile.
  *  5. Hand a single candidate record to $this->import().
  *
  * @param Entity $person Person to look up; ->name and getNameRegex() are used.
  *
  * @return null Always null; the result is passed to $this->import().
  */
 function getBusinessWeek(Entity $person)
 {
     $query = 'site:investing.businessweek.com ' . $person->name;
     $google_scraper = new LsGoogle();
     $google_scraper->setQuery($query);
     $this->printDebug($query);
     $google_scraper->execute();
     if (!$google_scraper->getNumResults()) {
         return null;
     }
     $results = $google_scraper->getResults();
     // First attempt: a search result that is already a profile URL.
     $businessweek_profile = null;
     foreach ($results as $result) {
         $this->printDebug($result->unescapedUrl);
         if (preg_match('/^.*?person\\.asp\\?personId=\\d+/is', $result->unescapedUrl, $match)) {
             $businessweek_profile = $match[0];
             break;
         }
     }
     if (!$businessweek_profile) {
         // Second attempt: follow each result and look for a profile link
         // whose link text matches the person's name.
         foreach ($results as $result) {
             $url = $result->unescapedUrl;
             // Keep only the part of the URL before the first '&'.
             if (preg_match('/^(.*?)\\&/is', $url, $match)) {
                 $url = $match[1];
             }
             if (!stristr($url, 'http://')) {
                 $url = 'http://investing.businessweek.com/' . $url;
             }
             $this->printDebug('new url: ' . $url);
             if (!$this->browser->get($url)->responseIsError()) {
                 $text = $this->browser->getResponseText();
                 $links = LsHtml::matchLinks($text);
                 foreach ($links as $link) {
                     if (preg_match('/' . $person->getNameRegex(true) . '/s', $link['text']) && preg_match('/^.*?person\\.asp\\?personId=\\d+/is', $link['url'], $match)) {
                         $url = $match[0];
                         if (!stristr($url, 'http://')) {
                             $url = 'http://investing.businessweek.com/' . $url;
                         }
                         $businessweek_profile = $url;
                         break;
                     }
                 }
                 if ($businessweek_profile) {
                     $this->printDebug('Businessweek profile found on 2nd attempt: ' . $businessweek_profile);
                     break;
                 }
             }
         }
         if (!$businessweek_profile) {
             $this->printDebug('Buisnessweek profile not found');
             return;
         }
     }
     //go to businessweek profile and get education
     $this->browser->get($businessweek_profile);
     $text = $this->browser->getResponseText();
     if (!$text) {
         $this->printDebug('Businessweek browser error');
         return;
     }
     $education_found = array();
     $ed_matched = false;
     if (preg_match('#EDUCATION[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<h2#is', $text, $education)) {
         $ed_matched = preg_match_all('/<strong>(.+?)<\\/strong>\\s*(\\d{4})?\\s*<\\/div><div.*?>(.+?)</s', $education[1], $education_found);
     }
     // Affiliations and background are optional sections. Default to "none"
     // so a missing section no longer trips over an unset match array.
     $companies = array();
     if (preg_match('#OTHER AFFILIATIONS[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/td#s', $text, $employment)
         && preg_match_all('#href\\=\\".+?\\"\\>(.+?)\\<\\/a\\>#is', $employment[1], $employment_found)) {
         $companies = $employment_found[1];
     }
     $summary = '';
     if (preg_match('#BACKGROUND[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/p>#s', $text, $summary_match)) {
         $summary = strip_tags($summary_match[1]);
     }
     if (!$ed_matched) {
         $this->printDebug('Education info not found at Businessweek');
         return;
     }
     $this->printDebug('Education info found at Businessweek');
     $wikipedia = new LsWikipedia();
     $wikipedia->request($person->name);
     $wikipedia->execute();
     $plaintext = $wikipedia->getPlainText();
     $education_history = null;
     // $education_found: [1] = degree, [2] = optional 4-digit year, [3] = institution.
     foreach ($education_found[3] as $key => $institution) {
         $arr = array(
             'institution' => $institution,
             'degree' => $education_found[1][$key],
             'year' => $education_found[2][$key] != '' ? $education_found[2][$key] : null,
         );
         $wikipedia_matches = LsLanguage::getCommonPronouns($arr['institution'], $plaintext, array_merge(LsLanguage::$business, LsLanguage::$schools, LsLanguage::$grammar));
         if ($wikipedia_matches) {
             $arr['source'] = 'http://en.wikipedia.org/wiki/' . str_replace('+', '_', $wikipedia->getTitle());
         } else {
             $arr['source'] = $businessweek_profile;
         }
         $education_history[] = (object) $arr;
     }
     // Left as null when there are no affiliations, so the (object) cast below
     // still yields an empty object as before.
     $employment_history = null;
     foreach ($companies as $company) {
         $employment_history[] = (object) array('company' => $company, 'title' => null);
     }
     $possible_person = array('name' => $person->name, 'summary' => $summary, 'employment_history' => (object) $employment_history, 'education' => (object) $education_history);
     $possible_persons = array((object) $possible_person);
     $this->import($person, $possible_persons);
 }
Ejemplo n.º 7
0
 public function parseResults($match)
 {
     if (isset($match['bio'])) {
         $bio_dirty = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($match['bio'], "; ")));
         $bio_dirty = preg_replace('/(\\;\\s)+/is', '; ', $bio_dirty);
     }
     foreach ($match as $k => &$m) {
         $m = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($m, " ")));
     }
     if (isset($match['name'])) {
         $name = $match['name'];
         $bio = '';
         if (isset($match['bio'])) {
             $bio = $match['bio'];
         }
     } else {
         return;
     }
     $this->printDebug("_________________________\n\nname: " . $name . "\n");
     $this->printDebug("bio: " . $bio . "\n");
     $accept = strtolower($this->readline('Process this entity? (n to skip) '));
     if ($accept == 'n' || $accept == 'no') {
         return false;
     }
     if (!$this->org_org) {
         if ($this->last_first) {
             $entity = PersonTable::parseCommaName($name);
         } else {
             $entity = PersonTable::parseFlatName($name);
         }
         $similar_entities = PersonTable::getSimilarQuery2($entity)->execute();
     } else {
         $entity = new Entity();
         $entity->addExtension('Org');
         foreach ($this->org_extensions as $ext) {
             $entity->addExtension($ext);
         }
         $entity->setEntityField('name', $name);
         $name = trim($name);
         $name = str_replace('.', '', $name);
         $similar_entities = OrgTable::getSimilarQuery($entity)->execute();
     }
     $matched = false;
     foreach ($similar_entities as $similar_entity) {
         if ($similar_entity['primary_ext'] == 'Person') {
             $this->printDebug('  POSSIBLE MATCH: ' . $similar_entity->name . ' (Orgs :: ' . $similar_entity->getRelatedOrgsSummary() . "  Bio :: {$similar_entity->summary})");
         } else {
             $this->printDebug('  POSSIBLE MATCH: ' . $similar_entity->name . ' (Summary :: ' . $similar_entity->summary . ')');
         }
         $accept = $this->readline('  Is this the same entity? (y or n)');
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline('  Is this the same entity? (y or n) ');
             $attempts++;
         }
         if ($accept == 'y') {
             $entity = $similar_entity;
             $matched = true;
             $this->printDebug('             [accepted]');
             //sleep(1);
             break;
         } else {
             if ($accept == 'break') {
                 break;
             }
         }
     }
     $created = false;
     if (!$matched) {
         if ($entity->getPrimaryExtension() == 'Person') {
             $this->printDebug('  New person: ' . $entity->name_first . ' ' . $entity->name_last);
         } else {
             $this->printDebug('  New org: ' . $entity->name);
         }
         $accept = $this->readline('    create this new entity? (y or n) ');
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline('    create this new entity? (y or n) ');
             $attempts++;
         }
         if ($accept == 'y') {
             if ($entity->getPrimaryExtension() == 'Person') {
                 $this->printDebug("\n  Bio: {$bio} \n");
                 $accept = $this->readline('    Add this bio? (y or n) ');
                 $attempts = 1;
                 while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                     $accept = $this->readline('    add this bio? (y or n) ');
                     $attempts++;
                 }
                 if ($accept == 'y') {
                     $entity->summary = $bio;
                 }
             }
             $entity->save();
             $entity->addReference($this->url, null, null, $this->url_name);
             $created = true;
             $this->printDebug(' ' . $entity->name . ' saved');
             //sleep(1);
         }
     }
     if (($matched || $created) && $entity->getPrimaryExtension() == 'Person') {
         $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
         $attempts = 1;
         while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
             $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
             $attempts++;
         }
         if ($accept == 'y') {
             $names = $entity->parseBio($bio_dirty);
             $this->printDebug(" Orgs that {$entity} has a position at?");
             foreach ($names as $name) {
                 $exists = false;
                 $name = trim($name);
                 $accept = $this->readline(" > {$name} ::  an org? (y or n or b to break) ");
                 $attempts = 1;
                 $accept = strtolower($accept);
                 while ($accept != 'y' && $accept != 'n' && $accept != 'b' && $attempts < 5) {
                     $accept = $this->readline("  {$name} ::  an org? (y or n or b to break) ");
                     $accept = strtolower($accept);
                     $attempts++;
                 }
                 if ($accept == 'b') {
                     break;
                 } else {
                     if ($accept == 'y') {
                         $this->printDebug(' .....looking for names.....');
                         $orgs = EntityTable::getByExtensionAndNameQuery('Org', $name)->limit(10)->execute();
                         $related_org = null;
                         foreach ($orgs as $org) {
                             $q = LsDoctrineQuery::create()->from('Relationship r')->where('entity1_id = ? and entity2_id = ?', array($entity->id, $org->id))->fetchOne();
                             if ($q) {
                                 $this->printDebug('  Position already exists, skipping...');
                                 $exists = true;
                                 break;
                             }
                             $accept = $this->readline("    Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                             $attempts = 1;
                             while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                                 $accept = $this->readline("    Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                                 $attempts++;
                             }
                             if ($accept == 'y') {
                                 $related_org = $org;
                                 break;
                             }
                         }
                         if (!$related_org && !$exists) {
                             $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                             while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                                 $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                                 $attempts++;
                             }
                             if ($accept == 'y') {
                                 $related_org = new Entity();
                                 $related_org->addExtension('Org');
                                 $related_org->name = preg_replace('/\\.(?!com)/i', '', $name);
                                 $extensions = $this->readline("  what extensions should this org get? (eg 'Business, LobbyingFirm, LawFirm') ");
                                 $extensions = preg_split('/\\,\\s*/isu', $extensions, -1, PREG_SPLIT_NO_EMPTY);
                                 try {
                                     foreach ($extensions as $extension) {
                                         $related_org->addExtension($extension);
                                     }
                                     $related_org->save();
                                     $related_org->addReference($this->url, null, null, $this->url_name);
                                 } catch (Exception $e) {
                                     $this->printDebug('   !!! problems with org creation, skipping');
                                     $related_org = null;
                                 }
                             }
                         }
                         if ($related_org) {
                             $q = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $related_org->id, 1))->fetchOne();
                             if ($q) {
                                 $this->printDebug('   (relationship already found, skipping...)');
                                 continue;
                             }
                             $relationship = new Relationship();
                             $relationship->Entity1 = $entity;
                             $relationship->Entity2 = $related_org;
                             $relationship->setCategory('Position');
                             $title = $this->readline("     Title for this position relationship? (<enter> to skip) ");
                             if (strlen($title) > 2) {
                                 $relationship->description1 = $title;
                             }
                             $current = strtolower($this->readline("      Is the relationship current? (y or n or <enter> to skip) "));
                             if (in_array($current, array('y', 'yes'))) {
                                 $relationship->is_current = 1;
                             } else {
                                 if (in_array($current, array('n', 'no'))) {
                                     $relationship->is_current = 0;
                                 }
                             }
                             $board = strtolower($this->readline("      Is the relationship a board position? (y or n or <enter> to skip) "));
                             if (in_array($board, array('y', 'yes'))) {
                                 $relationship->is_board = 1;
                             } else {
                                 if (in_array($board, array('n', 'no'))) {
                                     $relationship->is_board = 0;
                                 }
                             }
                             $relationship->save();
                             $relationship->addReference($this->url, null, null, $this->url_name);
                             $this->printDebug("     Relationship saved: {$relationship}");
                         }
                     }
                 }
             }
         }
     }
     if ($matched || $created) {
         if ($this->list) {
             $q = LsDoctrineQuery::create()->from('LsListEntity l')->where('l.entity_id = ? and l.list_id = ?', array($entity->id, $this->list->id))->fetchOne();
             if (!$q) {
                 $le = new LsListEntity();
                 $le->Entity = $entity;
                 $le->LsList = $this->list;
                 if (isset($match['rank'])) {
                     if (preg_match('/(\\d+)/isu', $match['rank'], $m)) {
                         $le->rank = $m[1];
                     }
                 }
                 $le->save();
                 $this->printDebug('List membership saved');
             }
         }
         if ($this->org) {
             $q = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $this->org->id, 1))->fetchOne();
             if ($q) {
                 $this->printDebug('   (relationship already found, skipping...)');
                 return;
             }
             $relationship = new Relationship();
             $relationship->Entity1 = $entity;
             $relationship->Entity2 = $this->org;
             $relationship->setCategory($this->relationship_category);
             if ($this->description1) {
                 $relationship->description1 = $this->description1;
             } else {
                 $description = $this->readline("       what description to give this relationship ({$relationship}) ? (less than 3 chars will skip)");
                 if (strlen($description) > 2) {
                     $relationship->description1 = $description;
                 }
             }
             if ($this->relationship_category == 'Position') {
                 $relationship->is_board = $this->is_board;
             } else {
                 if ($this->relationship_category == 'Donation') {
                     if ($this->amount) {
                         $relationship->amount = $this->amount;
                     } else {
                         $amount = $this->readline("  what amount ({$relationship}) ? (less than 3 chars will skip)");
                         if (strlen($amount) > 1) {
                             $relationship->amount = $amount;
                         }
                     }
                 }
             }
             $relationship->save();
             $relationship->addReference($this->url, null, null, $this->url_name);
             $this->printDebug(" Relationship saved: {$relationship}");
         }
     }
     //dump history
     if (isset($match['affiliation1'])) {
         $affiliation = $match['affiliation'];
         //$this->printDebug($affiliation);
     }
 }
 /**
  * Populates the record from one parsed row.
  *
  * Reads offsets 1-7 of $row: id, name, lifespan, type, party, state and a
  * "start-end" term string. Name and lifespan have HTML entities replaced
  * before trimming; type, party and state are only trimmed.
  *
  * @param array $row Indexed row; offsets 1 through 7 are read.
  */
 public function __construct($row)
 {
     $this->id = $row[1];
     $this->name = trim(LsHtml::replaceEntities($row[2]));
     $this->lifespan = trim(LsHtml::replaceEntities($row[3]));
     $this->type = trim($row[4]);
     $this->party = trim($row[5]);
     $this->state = trim($row[6]);
     // A term with no '-' yields a single element; pad to two so termEnd is
     // explicitly null instead of list() reading a missing offset.
     $term = array_pad(explode('-', trim($row[7])), 2, null);
     list($this->termStart, $this->termEnd) = $term;
 }
Ejemplo n.º 9
0
 /**
  * Stores a normalized copy of the given text.
  *
  * HTML entities are replaced first, then the result is passed through
  * LsString::utf8TransUnaccent() before being kept in $this->text.
  *
  * @param string $text Raw input text.
  */
 function __construct($text)
 {
     $this->text = LsString::utf8TransUnaccent(LsHtml::replaceEntities($text));
 }
Ejemplo n.º 10
0
 /**
  * Locates director name/age pairs in $this->text, derives the HTML segment
  * surrounding each pair and hands the parsed result to importDirectorInfo().
  *
  * Steps, in order:
  *  1. For every set of name matches in $this->sets, look for a two-digit
  *     number (20-99) shortly after each name and record it as an age match.
  *  2. Pick the run of age matches covering the most distinct people.
  *  3. Keep only matches that end in the dominant HTML tag (if one tag
  *     accounts for more than 80% of the run), then resolve duplicates.
  *  4. Mark "Director" position relationships of people that were not found
  *     as no longer current.
  *  5. Detect which tag encloses each name/age pair, cut one segment per
  *     person, parse it, and import the result.
  *
  * @return null Always null; returns early when $this->sets is empty, when too
  *              few people were matched, or when no enclosing tag is detected.
  */
 private function findBasicInfo()
 {
     if (!$this->sets) {
         return null;
     }
     $re = '/^([^<]*?<[^>]*>)*?[^<]*?(?<!([\\.,$\\/]))(\\b[2-9]\\d\\b)(?!((,\\s+200\\d|199\\d)|%|[,\\.]\\d|[-\\s]+([Yy]ears?\\s+(with|career)|[Dd]ays?|[Mm]onths?)\\b))/su';
     $age_match_sets = array();
     //go through the sets of name matches and find age matches for each
     foreach ($this->sets as $set) {
         $age_matches = array();
         for ($i = 0; $i < count($set); $i++) {
             //search window runs up to the next name match (2000 bytes for the last one)
             $len = $i == count($set) - 1 ? 2000 : $set[$i + 1]['pos'] - $set[$i]['pos'];
             if ($len > 100000) {
                 continue;
             }
             $str = substr($this->text, $set[$i]['pos'], $len);
             if (preg_match($re, $str, $match)) {
                 //count the tags between name and age; remember the last tag name seen
                 $n = preg_match_all('/<(\\p{L}+)[^>]*>/s', $match[0], $m, PREG_SET_ORDER);
                 $tag = 'empty';
                 if ($n > 0) {
                     $tag = $m[count($m) - 1][1];
                 }
                 $stripped = LsHtml::stripTags($match[0]);
                 if (strlen($stripped) < 2000) {
                     $age_matches[] = array('ind' => $i, 'age_match' => $match, 'age' => $match[3], 'name_match' => $set[$i], 'num_tags' => $n, 'tag' => $tag, 'len' => strlen($match[0]));
                 }
             }
         }
         $this->printDebug('count age matches is ' . count($age_matches));
         $age_match_sets[] = $age_matches;
     }
     //find the best set (most unique names and ages)
     $max = 0;
     $best = array(array('unique' => array(), 'set' => array()));
     foreach ($age_match_sets as $age_matches) {
         if (count($age_matches) < 2) {
             continue;
         }
         $unique = array($age_matches[0]['name_match']['id']);
         $temp = array($age_matches[0]);
         for ($i = 1; $i < count($age_matches); $i++) {
             //matches at most 4 name positions apart belong to the same run
             if ($age_matches[$i]['ind'] - 4 <= $age_matches[$i - 1]['ind']) {
                 $temp[] = $age_matches[$i];
                 if (!in_array($age_matches[$i]['name_match']['id'], $unique)) {
                     $unique[] = $age_matches[$i]['name_match']['id'];
                 }
             } else {
                 //run ended: compare it against the best run(s) found so far
                 if (count($unique) > $max) {
                     $max = count($unique);
                     if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($best[0]['unique']) > 2) {
                         array_unshift($best, array('unique' => $unique, 'set' => $temp));
                     } else {
                         $best = array(array('unique' => $unique, 'set' => $temp));
                     }
                 } else {
                     if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($unique) > 2) {
                         $best[] = array('unique' => $unique, 'set' => $temp);
                     }
                 }
                 $unique = array($age_matches[$i]['name_match']['id']);
                 $temp = array($age_matches[$i]);
             }
         }
         //the final run of this set has not been compared yet
         if (count($unique) > $max) {
             $max = count($unique);
             if (count(array_intersect($best[0]['unique'], $unique)) == 0) {
                 array_unshift($best, array('unique' => $unique, 'set' => $temp));
             } else {
                 $best = array(array('unique' => $unique, 'set' => $temp));
             }
         }
     }
     $best = $best[0]['set'];
     //find the tag all names have in common (if there is one)
     $tag_counts = array();
     foreach ($best as $b) {
         if (isset($tag_counts[$b['tag']])) {
             $tag_counts[$b['tag']]++;
         } else {
             $tag_counts[$b['tag']] = 1;
         }
         $this->printDebug($b['ind'] . '. ' . $b['name_match']['name'] . ' : ' . $b['age'] . ' : ' . strlen($b['age_match'][0]) . ' : ' . $b['num_tags'] . ' : ' . $b['tag']);
     }
     $tag = null;
     foreach ($tag_counts as $k => $v) {
         if ($v > 0.8 * count($best)) {
             $tag = $k;
             break;
         }
     }
     $age_set = array();
     if ($tag) {
         foreach ($best as $b) {
             if ($b['tag'] == $tag) {
                 $age_set[] = $b;
             }
         }
     } else {
         $age_set = $best;
     }
     $age_set = LsArray::multiSort($age_set, array('name_match', 'id'));
     //find duplicates and determine the best match out of the pair/set
     $singles = array();
     $doubles = array();
     $num_tags = 0;
     $len = 0;
     for ($i = 0; $i < count($age_set); $i++) {
         $double = array($age_set[$i]);
         while ($i < count($age_set) - 1 && $double[0]['name_match']['id'] == $age_set[$i + 1]['name_match']['id']) {
             $double[] = $age_set[$i + 1];
             $i++;
         }
         if (count($double) == 1) {
             $singles[] = $age_set[$i];
             $num_tags += $age_set[$i]['num_tags'];
             $len += $age_set[$i]['len'];
         } else {
             $doubles[] = $double;
         }
     }
     if (count($singles) < 3) {
         //too few unambiguous matches to build averages from: walk the matches in
         //page order and keep the first stretch in which no person repeats
         $unique = array();
         $sets = array(array());
         $age_set = LsArray::multiSort($age_set, array('name_match', 'pos'));
         foreach ($age_set as $a) {
             if (!in_array($a['name_match']['id'], $unique)) {
                 $unique[] = $a['name_match']['id'];
                 $sets[count($sets) - 1][] = $a;
             } else {
                 $unique = array($a['name_match']['id']);
                 $sets[] = array($a);
             }
         }
         $age_set = $sets[0];
     } else {
         //for each duplicated person keep the candidate that looks most like the
         //unambiguous matches (closest tag count)
         $avg_len = $len / count($singles);
         $avg_tags = $num_tags / count($singles);
         foreach ($doubles as $double) {
             $best = null;
             foreach ($double as $d) {
                 if (!$best) {
                     $best = $d;
                 } else {
                     if (abs($avg_tags - $best['num_tags']) > abs($avg_tags - $d['num_tags'])) {
                         $best = $d;
                     } else {
                         // NOTE(review): the length comparison uses ==, so the later
                         // candidate only wins on an exact tie in both measures; a
                         // "closer length wins" tie-break may have been intended - confirm.
                         if (abs($avg_tags - $best['num_tags']) == abs($avg_tags - $d['num_tags']) && abs($avg_len - $best['len']) == abs($avg_len - $d['len'])) {
                             $best = $d;
                         }
                     }
                 }
             }
             $singles[] = $best;
         }
         $age_set = LsArray::multiSort($singles, array('name_match', 'pos'));
     }
     //determine which directors were found, which weren't
     $ids = array();
     foreach ($age_set as $a) {
         $ids[] = $a['name_match']['id'];
     }
     // NOTE(review): relationships are marked not-current here, before the
     // "not enough names" sanity check below; confirm that ordering is intended.
     foreach ($this->people as $p) {
         if (!in_array($p->id, $ids)) {
             $category = Doctrine::getTable('RelationshipCategory')->findOneByName('Position');
             $relationship = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ?', $p->id)->addWhere('r.entity2_id = ?', $this->corp->id)->addWhere('r.category_id = ?', $category->id)->addWhere('r.description1 = ?', 'Director')->fetchOne();
             if ($relationship) {
                 $relationship->is_current = 0;
                 $relationship->save();
             }
         }
     }
     if (count($age_set) < 0.5 * count($this->people)) {
         $this->printDebug('not enough names in age set:' . count($age_set) . ' vs. ' . count($this->people));
         return null;
     }
     //figure out which tags surround name/age pairs
     $tag_arr = array('table' => array(), 'tr' => array(), 'td' => array(), 'div' => array(), 'br' => array(), 'p' => array());
     for ($i = 1; $i < count($age_set) - 1; $i++) {
         //text from the previous name to the next name, with the current name inside
         $str = substr($this->text, $age_set[$i - 1]['name_match']['pos'], $age_set[$i + 1]['name_match']['pos'] - $age_set[$i - 1]['name_match']['pos']);
         foreach ($tag_arr as $tag => &$arr) {
             $tag_str = LsHtml::getStringInTag($str, $tag, $age_set[$i]['name_match']['pos'] - $age_set[$i - 1]['name_match']['pos']);
             if (strlen($tag_str) > 0) {
                 $arr[] = strlen($tag_str);
             }
         }
         //drop the reference so later writes to $arr cannot touch $tag_arr
         unset($arr);
     }
     arsort($tag_arr);
     if (count(reset($tag_arr)) == 0) {
         $this->printDebug('problems with enclosing tag detection');
         return null;
     }
     //the first key after arsort() is the tag used to split the page into segments
     reset($tag_arr);
     $splitter = key($tag_arr);
     $tag_counts = array();
     for ($i = 0; $i < count($age_set) - 1; $i++) {
         $str = substr($this->text, $age_set[$i]['name_match']['pos'], $age_set[$i + 1]['name_match']['pos'] - $age_set[$i]['name_match']['pos']);
         //str_ireplace is used only for its replacement count
         str_ireplace('<' . $splitter, ' ', $str, $count);
         $tag_counts[] = $count;
     }
     sort($tag_counts);
     //smallest number of splitter tags seen between two consecutive names
     $ct = $tag_counts[0];
     if (!$ct) {
         return null;
     }
     $post_strlen = 0;
     $info_arr = array();
     $segments = array();
     for ($i = 0; $i < count($age_set); $i++) {
         $a = $age_set[$i];
         $matches = LsString::striposMulti($this->text, '</' . $splitter, $ct, $a['name_match']['pos']);
         $end = $matches[count($matches) - 1];
         $start = strripos(substr($this->text, 0, $a['name_match']['pos']), '<' . $splitter);
         $str = substr($this->text, $start, $end - $start);
         //the last person has no following name to bound the segment, so prefer
         //whichever candidate length is closer to the average of earlier segments
         if ($i == count($age_set) - 1 && count($matches) > 1) {
             $end = $matches[count($matches) - 2];
             $str2 = substr($this->text, $start, $end - $start);
             $avg = strlen(implode(' ', $segments)) / count($segments);
             if (abs(strlen($str2) - $avg) < abs(strlen($str) - $avg)) {
                 $str = $str2;
             }
         }
         $segments[] = $str;
         $info = $this->parseSegment($str, $a['name_match']['pos'] - $start, $a['name_match']['pos'] - $start + strlen($a['name_match']['match'][2][0]));
         $info = $this->parseBlurb($info, $a);
         //looks to see if bio appears after the parsed segment
         if ($i < count($age_set) - 1) {
             $next_start = strripos(substr($this->text, 0, $age_set[$i + 1]['name_match']['pos']), '<' . $splitter);
             $post_str = substr($this->text, $end, $next_start - $end);
         } else {
             $avg = $post_strlen / (count($age_set) - 1);
             $post_str = substr($this->text, $end, $avg);
         }
         $post_strlen += strlen($post_str);
         $post_str = LsHtml::replaceFontStyleTags($post_str);
         $person = $a['name_match']['person'];
         $last = LsString::escapeStringForRegex($person->name_last);
         $info['post_blurb'] = '';
         if (preg_match_all('/>([^<]*' . $last . '[^<]*)</isu', $post_str, $matches)) {
             $post_blurb = implode(' ', $matches[1]);
             $post_blurb = trim(preg_replace('/\\s+/s', ' ', $post_blurb));
             if (strlen($post_blurb) > 40) {
                 $info['post_blurb'] = $post_blurb;
             }
         }
         $info_arr[] = $info;
     }
     //$ct now counts segments whose trailing text is longer than the parsed blurb
     $ct = 0;
     $unv_ct = 0;
     foreach ($info_arr as $info) {
         if (strlen($info['post_blurb']) > strlen($info['blurb'])) {
             $ct++;
         }
         if ($info['img'] == null && $info['unverified_img'] != null) {
             $unv_ct++;
         }
     }
     //if most of the profile segments have images at the end, check to see if they belong to the next profile segment
     if ($unv_ct > count($age_set) - 3) {
         for ($i = 0; $i < count($age_set); $i++) {
             $len = strripos(substr($this->text, 0, $age_set[$i]['name_match']['pos']), '<' . $splitter);
             $tag_start = strripos(substr($this->text, 0, $len), '<img');
             $str = substr($this->text, $tag_start, 200);
             if (preg_match('/^<img[^>]+src=[\'"]([^\'"]+)[\'"]/is', $str, $match) == 1) {
                 //store on the i-th profile itself, not on the leftover foreach copy
                 $info_arr[$i]['img'] = $match[1];
             } else {
                 if ($i == 0) {
                     break;
                 }
             }
         }
     }
     for ($i = 0; $i < count($info_arr); $i++) {
         //when most bios sit after the segment, use the trailing text as the blurb
         if ($ct > 0.8 * count($age_set)) {
             $info_arr[$i]['blurb'] = $info_arr[$i]['post_blurb'];
             if (!$info_arr[$i]['since']) {
                 $info_arr[$i]['since'] = $this->getStartDate($info_arr[$i]['blurb']);
             }
         }
         $this->importDirectorInfo($info_arr[$i], $age_set[$i]);
         $this->printDebug("\n***");
     }
 }
Ejemplo n.º 11
0
 /**
  * Extracts strings that look like person names and make up the entire text
  * between a '>' and the following '<' in the given HTML.
  *
  * "First ... Last" candidates are only accepted when they contain a
  * standalone initial, start with a name listed in
  * LsLanguage::$commonFirstNames, or start with a first name already present
  * in the Person table. "Last, First" candidates are always accepted and are
  * converted with PersonTable::parseCommaName()->getFullName(false).
  *
  * @param string $text HTML to scan.
  * @return array List of name strings, "First Last" matches first.
  */
 static function getHtmlPersonNames($text)
 {
     //"First [Middle] Last[, Suffix]" alone inside an element
     $re2 = '/>\\s*(\\p{Lu}\'?(\\p{L}+|\\.)?\\s+(\\p{Lu}\'?(\\s+|\\p{L}+\\s+|\\.\\s*)?){0,2}\\p{Lu}\'?\\p{L}+(\\-\\p{Lu}\'?\\p{L}+)?(\\,?\\s+\\p{Lu}\\p{L}{1,4}\\.?)?)\\**\\s*</su';
     //"Last, First [Middle][, Suffix]" alone inside an element
     $re3 = '/>\\s*(\\p{Lu}\'?\\p{L}+(\\-\\p{Lu}\'?\\p{L}+)?\\,\\s+(\\p{Lu}\'?(\\p{L}+|\\.)?(\\s+(\\p{Lu}\'?(\\s+|\\p{L}+\\s+|\\.\\s*)?){0,2})?)(\\,?\\s+\\p{Lu}\\p{L}{1,4}\\.?)?)\\**\\s*</su';
     $text = LsHtml::replaceEntities($text);
     $name_matches = array();
     //first name => bool, so each distinct first name is looked up at most once
     $known_first_names = array();
     if (preg_match_all($re2, $text, $matches)) {
         //LOOP THROUGH MATCHES TO CONFIRM NAMES
         foreach ($matches[1] as $candidate) {
             //a standalone (middle) initial is taken as proof of a name
             $is_name = preg_match('/\\s+\\p{Lu}\\.?\\s/', $candidate) === 1;
             $parts = LsString::split(trim($candidate));
             $first = $parts[0];
             if (!$is_name && in_array($first, LsLanguage::$commonFirstNames)) {
                 $is_name = true;
             }
             //only hit the database when the cheaper checks were inconclusive
             if (!$is_name) {
                 if (!isset($known_first_names[$first])) {
                     $q = LsDoctrineQuery::create()->from('Person p')->where('p.name_first = ?', $first);
                     $known_first_names[$first] = $q->count() > 0;
                 }
                 $is_name = $known_first_names[$first];
             }
             //ADD NAME TO MATCH LIST IF IT FITS CONDITIONS
             if ($is_name) {
                 $name_matches[] = $candidate;
             }
         }
     }
     if (preg_match_all($re3, $text, $matches)) {
         foreach ($matches[1] as $candidate) {
             $person = PersonTable::parseCommaName($candidate);
             $name_matches[] = $person->getFullName(false);
         }
     }
     return $name_matches;
 }
Ejemplo n.º 12
0
 /**
  * Searches for "<entity name> board", fetches the top result and marks each
  * board relationship as current or not depending on whether one of the
  * person's name regexes matches the page.
  *
  * Relationships are only updated when at least two distinct people were
  * found on the page. They are saved, and a reference to the page added,
  * unless $this->testMode is set.
  *
  * @param array|Traversable $board_rels Relationship records; each is read for
  *        Entity1 and entity1_id and has is_current written.
  * @return int 1 when the relationships were updated; 0 when the page could
  *             not be fetched (error response or exception) or fewer than two
  *             people were found; -1 when the search gave no usable result.
  */
 public function checkBoardPage($board_rels)
 {
     $goog = new LsGoogle();
     $goog->setQuery($this->entity->name . " board");
     $results = $goog->execute();
     $results = $goog->parseSearchResults($results);
     if (!count($results)) {
         return -1;
     }
     $url = $results[0]['unescapedUrl'];
     //skip a yahoo.com hit in favour of the second result
     if (stripos($url, "yahoo.com") !== false) {
         if (!isset($results[1])) {
             return -1;
         }
         $url = $results[1]['unescapedUrl'];
     }
     $this->printDebug($url);
     try {
         if ($this->browser->get($url)->responseIsError()) {
             return 0;
         }
         $text = $this->browser->getResponseText();
         $text = LsHtml::replaceEntities($text);
         //keep each relationship with its status so they cannot drift apart
         $statuses = array();
         $unique_arr = array();
         foreach ($board_rels as $br) {
             $found = 0;
             foreach ($br->Entity1->getNameRegexes() as $regex) {
                 if (preg_match($regex, $text)) {
                     $found = 1;
                     if (!in_array($br->entity1_id, $unique_arr)) {
                         $unique_arr[] = $br->entity1_id;
                     }
                     break;
                 }
             }
             $statuses[] = array($br, $found);
             $this->printDebug($br->Entity1->name . " > " . $found);
         }
         //a single hit is not enough evidence that this is really the board page
         if (count($unique_arr) <= 1) {
             return 0;
         }
         $this->printDebug("\tenough board member names found to mark as current or not");
         foreach ($statuses as $status) {
             list($br, $found) = $status;
             $br->is_current = $found == 1 ? 1 : 0;
             if (!$this->testMode) {
                 $br->save();
                 $br->addReference($url, null, null, $this->entity->name . ' board', null, null);
             }
         }
         return 1;
     } catch (Exception $e) {
         return 0;
     }
 }
Ejemplo n.º 13
0
 /**
  * Returns the first entry of $this->_paragraphs as plain text: tags
  * stripped, HTML entities replaced and runs of whitespace collapsed.
  *
  * @return string|null The cleaned paragraph, or null when there are no paragraphs.
  */
 public function getCleanFirstParagraph()
 {
     //nothing to clean when no paragraphs are available
     if (!$this->_paragraphs) {
         return null;
     }
     $first = $this->_paragraphs[0];
     $first = LsString::spacesToSpace(LsHtml::replaceEntities(LsHtml::stripTags($first)));
     return $first;
 }
 /**
  * Fetches the proxy document at $url and tries to find, for every roster
  * entry, the person's name as it is written in the proxy.
  *
  * The fetched text is stored in $this->proxyText after entity replacement
  * and unaccenting. For each matched entry the keys proxyUrl, proxyYear,
  * nameLast and proxyName are set. When the request fails, a line is
  * appended to $this->logFile and the roster is returned untouched.
  *
  * @param array  $roster     Rows with at least officerTitle, isDirector and
  *                           personName (form 4 order, e.g. "RUBIN ROBERT E").
  * @param string $url        Location of the proxy document.
  * @param mixed  $proxy_year Stored as proxyYear on matched rows.
  * @return array The roster, with matched rows enriched.
  */
 private function getProxyData($roster, $url, $proxy_year)
 {
     echo "fetching data from proxy at {$url} \n\n";
     if ($this->browser->get($url)->responseIsError()) {
         //Error response (eg. 404, 500, etc)
         $log = fopen($this->logFile, 'a');
         if ($log !== false) {
             fwrite($log, "Couldn't get " . $url . "\n");
             fclose($log);
         }
         return $roster;
     }
     $this->proxyText = $this->browser->getResponseText();
     $this->proxyText = LsHtml::replaceEntities($this->proxyText, ENT_QUOTES, 'UTF-8');
     $this->proxyText = LsString::utf8TransUnaccent($this->proxyText);
     foreach ($roster as &$r) {
         //make sure this is not form 4 data for a corporation, continue to the next if it is
         if ($r['officerTitle'] == '' && $r['isDirector'] != 1 && strtoupper($r['isDirector']) != strtoupper('true')) {
             continue;
         }
         $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
         //an empty name gives nothing to search for
         if (!$parts) {
             continue;
         }
         //first word, but has to be part of last name because form4 names are in format RUBIN ROBERT E
         $last = trim($parts[0]);
         //sometimes O'LEARY can appear as O LEARY in the form 4
         if (strlen($last) == 1) {
             $r['personName'] = $last . substr($r['personName'], 2);
             $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
             $last = trim($parts[0]);
         }
         //prepare regex to match occurrences of full name
         //case insensitive to accommodate for various irregularities in names
         $re = LsLanguage::buildLooseNameRegex($r['personName']);
         $offset = 0;
         //use stripos (much faster than regex) to find occurrences of the first word in the form 4 name (assumed to be part of the last name)
         //needs to be case insensitive
         //continue searching for last name in proxy until a matching full name (proxyName) is found
         while (!isset($r['proxyName'])) {
             $found = stripos($this->proxyText, $last, $offset);
             if ($found === false) {
                 break;
             }
             $offset = $found + 1;
             //window of 70 bytes before and 50 after the hit; clamp the start so a
             //hit near the top of the document is not read from the end of the text
             $start = max(0, $found - 70);
             $str = substr($this->proxyText, $start, $found + 50 - $start);
             preg_match_all($re, $str, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);
             foreach ($matches as $match) {
                 if (stristr($match[1][0], '=')) {
                     continue;
                 }
                 //since we may or may not be working with the full last name, use getLastName to return full last name
                 $new_last = $this->getLastName($r['personName'], $match[1][0]);
                 if (!$new_last) {
                     continue;
                 }
                 //if last name produced by case insensitive search has no capital letters, not a match
                 if (preg_match('/\\p{Lu}/su', $new_last) == 0) {
                     continue;
                 }
                 //now that we have a last name, pull the full name from the string
                 $name = LsLanguage::getNameWithLast($match[0][0], $new_last);
                 if (!$name) {
                     continue;
                 }
                 $start_parts = preg_split('/\\s+/isu', $name['nameStart'], -1, PREG_SPLIT_NO_EMPTY);
                 $non_prefixes = array_diff($start_parts, PersonTable::$nameParsePrefixes);
                 //if all we've found are matching prefixes, not a match
                 if (count($non_prefixes) == 0) {
                     continue;
                 }
                 $name1_parts = preg_split('/\\s+/', $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
                 $ct = 0;
                 //compatibility check to correct for vagueness of regex
                 foreach ($non_prefixes as $n) {
                     foreach ($name1_parts as $p) {
                         if (stripos($n, $p) === 0 || stripos($p, $n) === 0) {
                             $ct++;
                         }
                     }
                 }
                 //phew -- if name is (somewhat) compatible, assume we've found it
                 if ($ct > 0) {
                     $r['proxyUrl'] = $url;
                     $r['proxyYear'] = $proxy_year;
                     $r['nameLast'] = trim(LsString::spacesToSpace($name['nameLast']));
                     $r['proxyName'] = trim(LsString::spacesToSpace($name['nameFull']));
                 }
             }
         }
     }
     unset($r);
     return $roster;
 }
Ejemplo n.º 15
0
 /**
  * Bulk-add relationships between the current entity and a list of names.
  *
  * The action is re-entered several times and dispatches on the request:
  *  - POST with commit "Begin"/"Continue": resolves the reference, collects
  *    candidate names (scrape, upload, summary, text or db_search), loads the
  *    relationship defaults and either hands off to the relationship toolbar
  *    (verify_method "onebyone") or fills $this->matches with search results.
  *  - a truthy "page" parameter: only stores the paging values.
  *  - POST with commit "Submit": creates the selected entities/relationships
  *    and redirects to the entity page.
  *  - POST with commit "Cancel": redirects to the entity page.
  *
  * @param mixed $request the current request object (parameters, files, errors)
  *
  * @return void returns early (no value) when a form is invalid, an error was
  *              set on the request, or an intermediate page must be rendered
  */
 public function executeAddBulk($request)
 {
     $this->checkEntity($request, false, false);
     $this->reference_form = new ReferenceForm();
     $this->reference_form->setSelectObject($this->entity);
     $this->add_bulk_form = new AddBulkForm();
     //get possible default categories (name => name), with an empty first choice
     $this->categories = LsDoctrineQuery::create()->select('c.name, c.name')->from('RelationshipCategory c')->orderBy('c.id')->fetchAll(PDO::FETCH_KEY_PAIR);
     array_unshift($this->categories, '');
     $isPost = $request->isMethod('post');
     $commit = $request->getParameter('commit');
     if ($isPost && in_array($commit, array('Begin', 'Continue'))) {
         // always defined so that later stages can iterate it safely
         $names = array();
         if ($request->hasParameter('ref_id')) {
             $this->ref_id = $request->getParameter('ref_id');
         } else {
             $refParams = $request->getParameter('reference');
             $this->reference_form->bind($refParams);
             // casts the parameter holder to an array and takes its first member
             // NOTE(review): presumably that member is the raw parameter array -- confirm
             $restOfParams = (array) $request->getParameterHolder();
             $restOfParams = array_shift($restOfParams);
             $this->add_bulk_form->bind($restOfParams, $request->getFiles());
             if (!$this->reference_form->isValid() || !$this->add_bulk_form->isValid()) {
                 return;
             }
             if ($this->ref_id = $refParams['existing_source']) {
                 $ref = Doctrine::getTable('Reference')->find($this->ref_id);
             } else {
                 $ref = new Reference();
                 $ref->object_model = 'Entity';
                 $ref->object_id = $this->entity->id;
                 $ref->source = $refParams['source'];
                 $ref->name = $refParams['name'];
                 $ref->source_detail = $refParams['source_detail'];
                 $ref->publication_date = $refParams['publication_date'];
                 $ref->save();
             }
             $this->ref_id = $ref->id;
             $this->reference = $ref;
         }
         $verify_method = $request->getParameter('verify_method');
         $add_method = $request->getParameter('add_method');
         $this->add_method = $add_method;
         if ($add_method) {
             if ($add_method == 'scrape') {
                 //scrape ref url
                 //set names to confirm
                 $browser = new sfWebBrowser();
                 $entity_types = $request->getParameter('entity_types');
                 //FIND NAMES AT URL USING COMBO OF OPENCALAIS & LS CUSTOM HTML PARSING
                 // NOTE(review): $ref is only assigned when no ref_id was posted;
                 // a request carrying both ref_id and add_method=scrape reads it undefined.
                 if ($browser->get($ref->source)->responseIsError()) {
                     $request->setError('csv', 'problems finding names at that url');
                 } else {
                     $text = $browser->getResponseText();
                     $this->names = LsTextAnalysis::getHtmlEntityNames($text, $entity_types);
                     $text = LsHtml::findParagraphs($text);
                     $this->text = preg_replace('/<[^b][^>]*>/is', " ", $text);
                     $this->confirm_names = true;
                     return;
                 }
             } elseif ($add_method == 'upload') {
                 $file = $this->add_bulk_form->getValue('file');
                 // check the upload itself: the destination path is always a
                 // non-empty string, so testing it could never detect a missing file
                 if (!$file) {
                     $request->setError('file', 'You need to upload a file.');
                     return;
                 }
                 $filename = 'uploaded_' . sha1($file->getOriginalName());
                 $extension = $file->getExtension($file->getOriginalExtension());
                 $filePath = sfConfig::get('sf_temp_dir') . '/' . $filename . $extension;
                 $file->save($filePath);
                 $spreadsheetArr = LsSpreadsheet::parse($filePath);
                 if (!$spreadsheetArr) {
                     $request->setError('file', 'The file you uploaded could not be parsed properly.');
                     return;
                 }
                 $names = $spreadsheetArr['rows'];
                 if (!in_array('name', $spreadsheetArr['headers'])) {
                     $request->setError('file', 'The file you uploaded could not be parsed properly because there is no "name" column.');
                     return;
                 }
                 if (in_array('summary', $spreadsheetArr['headers'])) {
                     foreach ($names as &$row) {
                         // normalise typographic (curly) quotes to plain ASCII quotes.
                         // NOTE(review): the previous literals were '?' placeholders, which
                         // turned every question mark into an apostrophe; they look like
                         // UTF-8 curly quotes lost in an encoding conversion -- confirm.
                         $row['summary'] = str_replace(array("\xE2\x80\x98", "\xE2\x80\x99"), "'", $row['summary']);
                         $row['summary'] = str_replace(array("\xE2\x80\x9C", "\xE2\x80\x9D"), '"', $row['summary']);
                         if (isset($row['title'])) {
                             $row['description1'] = $row['title'];
                         }
                     }
                     // drop the dangling reference left by the by-reference loop
                     unset($row);
                 }
             } elseif ($add_method == 'summary') {
                 //parse summary for names
                 $this->text = $this->entity->summary;
                 $entity_types = $request->getParameter('entity_types');
                 $this->names = LsTextAnalysis::getTextEntityNames($this->text, $entity_types);
                 $this->confirm_names = true;
                 return;
             } elseif ($add_method == 'text') {
                 $manual_names = $request->getParameter('manual_names');
                 if (!$manual_names) {
                     $request->setError('csv', 'You did not add names properly.');
                     return;
                 }
                 // one name per line
                 foreach (array_map('trim', preg_split('#[\\r\\n]+#', $manual_names)) as $manual_name) {
                     $names[] = array('name' => $manual_name);
                 }
             } elseif ($add_method == 'db_search') {
                 $this->db_search = true;
             }
         }
         //intermediate scrape page -- takes confirmed names, builds names arr
         if ($confirmed_names = $request->getParameter('confirmed_names')) {
             $restOfParams = (array) $request->getParameterHolder();
             $restOfParams = array_shift($restOfParams);
             $this->add_bulk_form->bind($restOfParams, $request->getFiles());
             if (!$this->add_bulk_form->isValid()) {
                 $this->reference = Doctrine::getTable('Reference')->find($this->ref_id);
                 // SECURITY(review): 'names' comes straight from the request and
                 // unserialize() on untrusted input is unsafe; behaviour left unchanged,
                 // consider a JSON round-trip instead.
                 $this->names = unserialize(stripslashes($request->getParameter('names')));
                 $this->confirm_names = true;
                 return;
             }
             $names = array();
             foreach ($confirmed_names as $cn) {
                 $names[] = array('name' => $cn);
             }
             $manual_names = $request->getParameter('manual_names');
             if ($manual_names) {
                 foreach (array_map('trim', preg_split('#[\\r\\n]+#', $manual_names)) as $manual_name) {
                     $names[] = array('name' => $manual_name);
                 }
             }
         }
         // LOAD IN RELATIONSHIP DEFAULTS
         if (isset($verify_method)) {
             $defaults = $request->getParameter('relationship');
             // stays empty unless the enmasse branch fills it
             $extensions_arr = array();
             if ($verify_method == 'enmasse') {
                 $this->default_type = $request->getParameter('default_type');
                 $this->order = $request->getParameter('order');
                 $category_name = $request->getParameter('relationship_category_all');
                 $this->extensions = ExtensionDefinitionTable::getByTier(2, $this->default_type);
                 foreach ($this->extensions as $ext) {
                     $extensions_arr[] = $ext->name;
                 }
             } else {
                 $category_name = $request->getParameter('relationship_category_one');
             }
             if ($category_name) {
                 $this->category_name = $category_name;
                 if (!Doctrine::getTable('RelationshipCategory')->findOneByName($category_name)) {
                     $request->setError('csv', 'You did not select a relationship category.');
                     return;
                 }
                 $formClass = $category_name . 'Form';
                 $categoryForm = new $formClass(new Relationship());
                 $categoryForm->setDefaults($defaults);
                 $this->form_schema = $categoryForm->getFormFieldSchema();
                 // these categories expose a single description field
                 $singleDescription = array('Position', 'Education', 'Membership', 'Donation', 'Lobbying', 'Ownership');
                 $field_names = in_array($category_name, $singleDescription) ? array('description1') : array('description1', 'description2');
                 $field_names = array_merge($field_names, array('start_date', 'end_date', 'is_current'));
                 $extraFields = array('Position' => array('is_board', 'is_executive'), 'Education' => array('degree_id'), 'Donation' => array('amount'), 'Transaction' => array('amount'), 'Lobbying' => array('amount'), 'Ownership' => array('percent_stake', 'shares'));
                 if (isset($extraFields[$category_name])) {
                     $field_names = array_merge($field_names, $extraFields[$category_name]);
                 }
                 $this->field_names = $field_names;
             }
             $this->matches = array();
             // BOOT TO TOOLBAR OR LOOK FOR MATCHES FOR ENMASSE ADD
             if (!empty($names) || isset($this->db_search)) {
                 if ($verify_method == 'onebyone') {
                     if (isset($category_name)) {
                         $defaults['category'] = $category_name;
                     }
                     $toolbar_names = array();
                     foreach ($names as $entry) {
                         $toolbar_names[] = $entry['name'];
                     }
                     $user = $this->getUser();
                     $user->setAttribute('toolbar_names', $toolbar_names);
                     $user->setAttribute('toolbar_entity', $this->entity->id);
                     $user->setAttribute('toolbar_defaults', $defaults);
                     $user->setAttribute('toolbar_ref', $this->ref_id);
                     $this->redirect('relationship/toolbar');
                 } else {
                     $this->category_name = $category_name;
                     if (isset($this->db_search)) {
                         // entities whose summary or blurb mentions this entity's name or an alias
                         $entityPattern = '[[:<:]]' . $this->entity->name . '[[:>:]]';
                         $q = LsDoctrineQuery::create()->from('Entity e')->where('(e.summary rlike ? or e.blurb rlike ?)', array($entityPattern, $entityPattern));
                         foreach ($this->entity->Alias as $alias) {
                             $aliasPattern = '[[:<:]]' . $alias->name . '[[:>:]]';
                             $q->orWhere('(e.summary rlike ? or e.blurb rlike ?)', array($aliasPattern, $aliasPattern));
                         }
                         $q->setHydrationMode(Doctrine::HYDRATE_ARRAY);
                         $cat_id = constant('RelationshipTable::' . strtoupper($category_name) . '_CATEGORY');
                         $q->whereParenWrap();
                         // ...that do not already have a link of this category to the entity
                         $q->andWhere('NOT EXISTS (SELECT DISTINCT l.relationship_id FROM Link l WHERE l.entity1_id = e.id AND l.entity2_id = ? AND l.category_id = ?)', array($this->entity['id'], $cat_id));
                         $summary_matches = $q->execute();
                         // the highlight pattern depends only on the aliases, so build it once
                         $aliasRegexParts = array();
                         foreach ($this->entity->Alias as $alias) {
                             $aliasRegexParts[] = LsString::escapeStringForRegex($alias->name);
                         }
                         $highlightRe = '/(' . implode("|", $aliasRegexParts) . ')/is';
                         foreach ($summary_matches as $summary_match) {
                             $summary_match['summary'] = preg_replace($highlightRe, '<strong>$1</strong>', $summary_match['summary']);
                             $this->matches[] = array('search_results' => array($summary_match));
                         }
                     } else {
                         $total = count($names);
                         for ($i = 0; $i < $total; $i++) {
                             // skip rows without a usable name
                             if (!isset($names[$i]['name']) || trim($names[$i]['name']) == '') {
                                 continue;
                             }
                             $name = $names[$i]['name'];
                             $terms = $name;
                             if ($this->default_type == 'Person') {
                                 // single-word person names are searched as typed
                                 if (count(preg_split('/\\s+/', $name)) > 1) {
                                     $terms = PersonTable::nameSearch($name);
                                 }
                                 $primary_ext = "Person";
                             } elseif ($this->default_type == 'Org') {
                                 $terms = OrgTable::nameSearch($name);
                                 $primary_ext = "Org";
                             } else {
                                 $primary_ext = null;
                             }
                             // positional arguments: page 1, 20 results, no list ids, aliases on
                             $pager = EntityTable::getSphinxPager($terms, 1, 20, null, true, $primary_ext);
                             $match = $names[$i];
                             $match['search_results'] = $pager->execute();
                             if (isset($names[$i]['types'])) {
                                 // keep only the types that are known extensions
                                 $match['types'] = array();
                                 foreach (array_map('trim', explode(',', $names[$i]['types'])) as $type) {
                                     if (in_array($type, $extensions_arr)) {
                                         $match['types'][] = $type;
                                     }
                                 }
                             }
                             $this->matches[] = $match;
                         }
                     }
                 }
             }
         }
     } elseif ($page = $this->getRequestParameter('page')) {
         $this->page = $page;
         $this->num = $this->getRequestParameter('num', 50);
     } elseif ($isPost && $commit == 'Submit') {
         $this->ref_id = $this->getRequestParameter('ref_id');
         $relationship_category = $this->getRequestParameter('category_name');
         $order = $this->getRequestParameter('order');
         $default_type = $request->getParameter('default_type');
         $default_ref = Doctrine::getTable('Reference')->find($request->getParameter('ref_id'));
         $isDbSearch = $request->hasParameter('add_method') && $request->getParameter('add_method') == 'db_search';
         $count = $this->getRequestParameter('count');
         for ($i = 0; $i < $count; $i++) {
             $entity_id = $request->getParameter('entity_' . $i);
             if (!$entity_id) {
                 continue;
             }
             // start every row with a clean reference so that a custom reference
             // from a row that saved nothing cannot leak into the next row
             $ref = null;
             $selected_entity_id = null;
             $relParams = $request->getParameter("relationship_" . $i);
             if (!is_array($relParams)) {
                 $relParams = array();
             }
             if (!empty($relParams['ref_name'])) {
                 $ref = array('source' => isset($relParams['ref_source']) ? $relParams['ref_source'] : null, 'name' => $relParams['ref_name']);
             }
             if ($entity_id == 'new') {
                 $name = $request->getParameter('new_name_' . $i);
                 if ($default_type == 'Person') {
                     $new_entity = PersonTable::parseFlatName($name);
                 } else {
                     $new_entity = new Entity();
                     $new_entity->addExtension('Org');
                     $new_entity->name = trim($name);
                 }
                 $new_entity->save();
                 $new_entity->blurb = $request->getParameter('new_blurb_' . $i);
                 $new_entity->summary = $request->getParameter('new_summary_' . $i);
                 if (!$ref) {
                     $ref = $default_ref;
                 }
                 $new_entity->addReference($ref['source'], null, null, $ref['name']);
                 if ($types = $request->getParameter('new_extensions_' . $i)) {
                     foreach ($types as $type) {
                         $new_entity->addExtension($type);
                     }
                 }
                 $new_entity->save();
                 $selected_entity_id = $new_entity->id;
             } elseif ($entity_id > 0) {
                 $selected_entity_id = $entity_id;
                 LsCache::clearEntityCacheById($selected_entity_id);
             }
             if (!$selected_entity_id) {
                 continue;
             }
             $startDate = isset($relParams['start_date']) ? $relParams['start_date'] : null;
             $endDate = isset($relParams['end_date']) ? $relParams['end_date'] : null;
             // dates and reference fields are handled separately and must not reach
             // fromArray(); ref_source is the key read above, ref_url is kept in the
             // list for compatibility with the previous code
             unset($relParams['start_date'], $relParams['end_date'], $relParams['ref_name'], $relParams['ref_source'], $relParams['ref_url']);
             $rel = new Relationship();
             $rel->setCategory($relationship_category);
             if ($order == '1') {
                 $rel->entity1_id = $this->entity['id'];
                 $rel->entity2_id = $selected_entity_id;
             } else {
                 $rel->entity2_id = $this->entity['id'];
                 $rel->entity1_id = $selected_entity_id;
             }
             //only set dates if valid
             if ($startDate) {
                 $dbStartDate = Dateable::convertForDb($startDate);
                 if (preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', $dbStartDate)) {
                     $rel->start_date = $dbStartDate;
                 }
             }
             if ($endDate) {
                 $dbEndDate = Dateable::convertForDb($endDate);
                 if (preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', $dbEndDate)) {
                     $rel->end_date = $dbEndDate;
                 }
             }
             $rel->fromArray($relParams, null, true);
             if ($isDbSearch) {
                 // prefer a summary reference of the matched entity, else any of its references
                 $refs = EntityTable::getSummaryReferences($selected_entity_id);
                 if (!count($refs)) {
                     $refs = EntityTable::getAllReferencesById($selected_entity_id);
                 }
                 if (count($refs)) {
                     $ref = $refs[0];
                 }
             }
             if (!$ref) {
                 $ref = $default_ref;
             }
             $rel->saveWithRequiredReference(array('source' => $ref['source'], 'name' => $ref['name']));
         }
         $this->clearCache($this->entity);
         $this->redirect($this->entity->getInternalUrl());
     } elseif ($isPost && $commit == 'Cancel') {
         $this->redirect($this->entity->getInternalUrl());
     }
 }