/**
 * Downloads a New York State Board of Elections donation page and parses it.
 *
 * A ReferenceForm is always created. On POST the submitted "reference"
 * parameters are bound to it; if the submitted source URL points at the
 * Board of Elections PL/SQL browser, the page is fetched and the output of
 * LsHtml::parseNyDonations() is stored in $this->parsed_text.
 *
 * @param object $request request object; only isMethod() and getParameter()
 *                        are used (presumably an sfWebRequest - confirm)
 */
public function executeParseNyDonations($request)
{
    $this->reference_form = new ReferenceForm();

    if (!$request->isMethod('post')) {
        return;
    }

    $refParams = $request->getParameter('reference');
    $this->reference_form->bind($refParams);

    // The source may be missing or not a string when the form is tampered with.
    $url = '';
    if (is_array($refParams) && isset($refParams['source']) && is_string($refParams['source'])) {
        $url = $refParams['source'];
    }

    // The URL is fetched server-side, so the host must match literally.
    // The previous pattern used bare "." wildcards for "://", ":8080/" and
    // the dots in the host name, which let other hosts slip through.
    if (!preg_match('#^http://www\\.elections\\.state\\.ny\\.us:8080/plsql_browser.*$#is', $url)) {
        return;
    }

    $browser = new sfWebBrowser();
    if ($browser->get($url)->responseIsError()) {
        return;
    }

    $this->parsed_text = LsHtml::parseNyDonations($browser->getResponseText());
}
/**
 * Searches an HTML page for a biography-like passage about a person.
 *
 * Text runs that mention the person's last name (and first name, when the
 * page title contains both names) are collected, cleaned and filtered by a
 * series of heuristics: length, presence of the name, news-style markers,
 * long quotations, lowercase word count, trailing period and "bio words".
 * The surviving passages are joined and accepted only if the name appears
 * near a biographical verb and early in the text. When the organisation's
 * name contains the person's last name, the first passage must also
 * mention the first name, an honorific or a pronoun.
 *
 * @param string $page   HTML to search
 * @param object $person provides name_first, name_last and getExtensions()
 * @param object $org    provides name
 *
 * @return string|false the assembled bio, or false when nothing qualifies
 */
private function findPersonBio($page, $person, $org)
{
    $name_re = LsString::escapeStringForRegex($person->name_last);

    // Also match on the first name when the page title carries both names.
    if (preg_match('/<title>([^<]*)<\\/title>/is', $page, $title_match)) {
        $title = $title_match[1];
        if (stristr($title, $person->name_last) && stristr($title, $person->name_first) && strlen($person->name_first) > 2) {
            $name_re .= '|' . LsString::escapeStringForRegex($person->name_first);
        }
    }

    $layout_tags = implode('|', LsHtml::$layoutTags);
    $plain_re = '/>([^<]*?(' . $name_re . ')(\\s|,|<)(.*?))<\\/?(' . $layout_tags . ')/is';
    $utf8_re = $plain_re . 'u';

    // Try the UTF-8 pattern first and fall back to the byte-wise one.
    if (!preg_match_all($utf8_re, $page, $matches) && !preg_match_all($plain_re, $page, $matches)) {
        $this->printDebug('no matches found');

        return false;
    }

    $passages = array();
    $qualified = false;
    $news = false;

    foreach ($matches[1] as $snippet) {
        // Script fragments are never biographies.
        if (stristr($snippet, '}') || stristr($snippet, '{') || preg_match('/\\svar\\s/is', $snippet)) {
            continue;
        }

        $str = LsHtml::replaceEntities($snippet);
        $str = LsHtml::stripTags($str, '');
        $str = trim(LsString::spacesToSpace($str));
        $this->printDebug(strlen($str));

        if (strlen($str) > 3000) {
            $this->printDebug('FAILED - str too long');
            continue;
        }

        if (preg_match('/(^|\\b)(' . $name_re . ')\\b/is', $str) == 0) {
            $this->printDebug($snippet . 'FAILED - no name match');
            continue;
        }

        if (count(explode(' ', $str)) < 12) {
            $this->printDebug('FAILED - str not long enough');
            continue;
        }

        // Press-release markers flag the passage as news.
        if (stristr($str, 'announce') || stristr($str, 'today') || stristr($str, '—') || stristr($str, '–') || preg_match('/^[^\\-]{0,100}\\-(\\-|\\s)/is', $str)) {
            $news = true;
            $this->printDebug('FAILED: dash / announced / today');
            continue;
        }

        if (preg_match('/(^|\\s)([\'"”])([^\\1]+)\\1/is', $str, $qm) && count(explode(' ', $qm[0])) > 6) {
            $news = true;
            $this->printDebug('FAILED: quote');
            continue;
        }

        if (preg_match_all('/\\s(\\p{Ll})+\\b/su', $str, $lcm) < 5) {
            $this->printDebug('FAILED: not enough lowercase');
            continue;
        }

        $bio_words = PersonTable::$commonBioWords;
        if (in_array('Lobbyist', $person->getExtensions())) {
            $bio_words = array_merge($bio_words, LobbyistTable::$commonBioWords);
        }
        $bio_word_ct = preg_match_all('/\\s(' . implode('|', $bio_words) . ')\\s/is', $str, $bio_word_matches);
        $str = trim($str);

        if (preg_match('/\\.$/is', $str) == 0) {
            $this->printDebug('no period at end of string');
            continue;
        }

        if ($bio_word_ct > 1) {
            $news = false;
            $qualified = true;
            $passages[] = $str;
            continue;
        }

        $this->printDebug('less than 2 bio words');
        // $news carries over from earlier passages on purpose.
        if ($news == false) {
            $passages[] = preg_replace('/^[\\,\\.\\:\\;]\\s*/su', '', $str);
        }
    }

    if (!$qualified) {
        return false;
    }

    $passages = array_unique($passages);
    $bio = implode("\n\n", $passages);
    $accepted = false;

    if (strlen($bio) < 3000 && LsString::withinN($bio, '(' . $name_re . ')', '(is|was|holds|led|has|had|provides|practices|served|leads)', 2)) {
        if (preg_match('/^.*?\\b(' . $name_re . ')\\b/is', $bio, $m) && count(explode(' ', $m[0])) < 20) {
            $accepted = true;
            $this->printDebug('SUCCESS');
        }
    } else {
        $this->printDebug('within N failed !!!!');
    }

    if (!$accepted) {
        return false;
    }

    // If the org name does not contain the last name, no further test applies.
    if (!stristr($org->name, $person->name_last)) {
        return $bio;
    }

    // The last name alone could refer to the org, so look for more evidence.
    $first_passage = $passages[0];

    if (strlen($person->name_first) > 1) {
        if (preg_match('/([^\\s]+\\s+){0,14}/is', $first_passage, $beg_match)) {
            $nf_re = LsString::escapeStringForRegex($person->name_first);
            if (preg_match('/\\b' . $nf_re . '\\b/is', $beg_match[0]) || preg_match('/\\b(Mr|Mrs|Ms)\\b/su', $first_passage)) {
                return $bio;
            }
        }

        return false;
    }

    if (preg_match('/\\b(he|she|him|her|his|mr|ms|mrs)\\b/is', $first_passage)) {
        return $bio;
    }

    return false;
}
/**
 * Imports one governor profile page.
 *
 * Fetches $this->_baseUrl . $row['url'] and, if the request succeeds,
 * creates or reuses a person entity for the governor. It then fills in the
 * birth date, birthplace, party and summary, and records schools, the
 * governor's office, governorship terms, spouse, phone and image. Every
 * saved record gets a reference to the page labelled
 * 'Governors Association'. Nothing happens when the page request fails.
 *
 * @param array $row scraped row; the keys read here are 'url', 'name',
 *                   'state' and 'years' (a list of term boundary years)
 */
protected function importGovernor($row)
{
    $url = $this->_baseUrl . $row['url'];
    if ($this->browser->get($url)->responseIsError()) {
        return;
    }

    $text = $this->browser->getResponseText();
    $text = LsHtml::replaceEntities($text);

    $name = trim(str_ireplace('Gov.', '', $row['name']));
    $this->printDebug('');
    $this->printDebug($name . ':');

    $governor = PersonTable::parseFlatName($name);
    $governor->addExtension('PoliticalCandidate');
    $governor->addExtension('ElectedRepresentative');
    $governor->is_state = 1;

    // Reuse an existing entity when one first name is a prefix of the other
    // and the existing bio mentions a governorship.
    $similar = $governor->getSimilarEntitiesQuery(true)->execute();
    foreach ($similar as $s) {
        $sim_re = LsString::escapeStringForRegex($s->name_first);
        $search_re = LsString::escapeStringForRegex($governor->name_first);
        if (preg_match('/^' . $sim_re . '/su', $governor->name_first) == 0 && preg_match('/^' . $search_re . '/su', $s->name_first) == 0) {
            continue;
        }
        $bio = $s->getExtendedBio();
        if (preg_match('/\\bgovernor(ship)?\\b/isu', $bio)) {
            $governor = $s;
            $this->printDebug(' Found existing governor: ' . $s->name . ' ' . $s->id);
            break;
        }
    }

    $governor->save();
    $this->printDebug($governor->id);

    if (!$governor->start_date && preg_match('/>Born\\:<\\/b>([^<]*)<br/is', $text, $birth_arr)) {
        $this->printDebug(' Birthdate: ' . $birth_arr[1]);
        $governor->start_date = trim($birth_arr[1]);
    }

    if (!$governor->birthplace && preg_match('/>Birth State\\:<\\/b>([^<]*)<br/is', $text, $birth_state_arr)) {
        $this->printDebug(' Birthplace: ' . trim($birth_state_arr[1]));
        $governor->birthplace = trim($birth_state_arr[1]);
    }

    //PARTY MEMBERSHIP
    if (preg_match('/>Party\\:<\\/b>([^<]*)<br/is', $text, $party_arr)) {
        $party_str = $party_arr[1];
        $this->printDebug(' Party: ' . $party_str);

        $party = null;
        if (stristr($party_str, 'Democrat')) {
            $party = EntityTable::getByExtensionQuery('PoliticalParty')->addWhere('name = ?', 'Democratic Party')->fetchOne();
        }
        if (stristr($party_str, 'Republican')) {
            $party = EntityTable::getByExtensionQuery('PoliticalParty')->addWhere('name = ?', 'Republican Party')->fetchOne();
        }

        if ($party && !$governor->party_id) {
            $governor->Party = $party;
            $governor->is_independent = false;
            $this->printDebug(' Added membership in ' . $party);
        } elseif (stristr($party_str, 'Independent')) {
            $governor->is_independent = true;
        }
    }

    // SUMMARY: concatenate every long text run, skipping "Javascript" notices.
    if (!$governor->summary && preg_match_all('/>([^<]{240,})/isu', $text, $bio_match)) {
        $str = '';
        foreach ($bio_match[1] as $b) {
            if (!stristr($b, 'Javascript')) {
                $str .= "\n\n" . $b;
            }
        }
        $str = trim($str);
        if (strlen($str)) {
            $governor->summary = $str;
        }
    }

    $governor->save();
    $governor->addReference($url, null, $governor->getAllModifiedFields(), 'Governors Association');

    //SCHOOLS
    if (preg_match('/>School\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $school_arr)) {
        $school_names = explode(';', trim($school_arr[1]));
        if (count($school_names) == 1) {
            $school_names = explode(',', $school_names[0]);
        }

        foreach ($school_names as $school_name) {
            $school_name = trim($school_name);

            // An empty field or trailing separator yields '' here. Skip it
            // so that no nameless School entity is created and linked.
            if ($school_name === '') {
                continue;
            }

            $school = EntityTable::getByExtensionQuery('School')
                ->leftJoin('e.Alias a')
                ->addWhere('e.name = ? or a.name = ?', array($school_name, $school_name))
                ->fetchOne();

            if (!$school) {
                $school = new Entity();
                $school->addExtension('Org');
                $school->addExtension('School');
                $school->name = $school_name;
                $school->save();
                $this->printDebug(' Added School: ' . $school_name);
            }

            $q = RelationshipTable::getByCategoryQuery('Education')
                ->addWhere('entity1_id = ? and entity2_id = ?', array($governor->id, $school->id))
                ->fetchOne();

            if (!$q) {
                $relationship = new Relationship();
                $relationship->setCategory('Education');
                $relationship->Entity1 = $governor;
                $relationship->Entity2 = $school;
                $relationship->is_current = 0;
                $relationship->save();
                $relationship->addReference($url, null, $relationship->getAllModifiedFields(), 'Governors Association');
                $this->printDebug(' Added education: ' . $relationship->name);
            }
        }
    }

    //GOVERNOR OFFICE AND POSITION
    $office_name = 'Office of the Governor of ' . $row['state'];
    $office = EntityTable::getByExtensionQuery('GovernmentBody')->addWhere('name = ?', $office_name)->fetchOne();

    if (!$office) {
        $office = new Entity();
        $office->name = $office_name;
        $office->addExtension('Org');
        $office->addExtension('GovernmentBody');
        $state = Doctrine::getTable('AddressState')->findOneByName($row['state']);
        if ($state) {
            $office->state_id = $state->id;
        }
        $office->save();
        $office->addReference($url, null, $office->getAllModifiedFields(), 'Governors Association');
        $this->printDebug(' Added office: ' . $office->name);
    }

    $q = RelationshipTable::getByCategoryQuery('Position')
        ->addWhere('entity1_id = ? and entity2_id = ? and description1 = ?', array($governor->id, $office->id, 'Governor'))
        ->fetchOne();

    if (!$q) {
        // Years come in (start, end) pairs. An unpaired final year is the
        // start of a term that is still running.
        $years = $row['years'];
        sort($years);

        for ($i = 0; $i < count($years); $i += 2) {
            $governorship = new Relationship();
            $governorship->setCategory('Position');
            $governorship->Entity1 = $governor;
            $governorship->Entity2 = $office;
            $governorship->description1 = 'Governor';
            $governorship->start_date = $years[$i];

            if (isset($years[$i + 1])) {
                $governorship->end_date = $years[$i + 1];
                $governorship->is_current = 0;
                // Only the last completed term sets the "Former" blurb.
                if (!$governor->blurb && !isset($years[$i + 2])) {
                    $governor->blurb = 'Former Governor of ' . $row['state'];
                }
            } else {
                $governorship->is_current = 1;
                if (!$governor->blurb) {
                    $governor->blurb = 'Governor of ' . $row['state'];
                }
            }

            $governor->save();
            $governorship->save();
            $governorship->addReference($url, null, $governorship->getAllModifiedFields(), 'Governors Association');
            $this->printDebug(' Added governorship: ' . $governorship->name);
        }
    }

    //SPOUSE
    if (preg_match('/>Spouse\\:<\\/b>(.*?)<br/is', $text, $spouse_arr)) {
        $spouse = trim(LsHtml::stripTags($spouse_arr[1]));

        // Any existing Family relationship blocks creation of a spouse.
        $q = RelationshipTable::getByCategoryQuery('Family')
            ->addWhere('entity1_id = ? or entity2_id = ?', array($governor->id, $governor->id))
            ->fetchOne();

        if (!$q && strlen($spouse)) {
            $spouse = PersonTable::parseFlatName($spouse);
            $spouse->save();
            $this->printDebug(' Added spouse: ' . $spouse->name);

            $relationship = new Relationship();
            $relationship->setCategory('Family');
            $relationship->Entity1 = $spouse;
            $relationship->Entity2 = $governor;
            $relationship->description1 = 'Spouse';
            $relationship->description2 = 'Spouse';
            $relationship->save();
            $relationship->addReference($url, null, $relationship->getAllModifiedFields(), 'Governors Association');
            $this->printDebug(' Added spouse relationship: ' . $relationship->name);
        }
    }

    //ADDRESS --not working, malformed addresses
    /*
    if (preg_match('/>Address\:\s*<\/b>(.*?)<b>/is',$text,$address_arr))
    {
      $address = trim(str_replace('<br/>',', ',$address_arr[1]));
      $this->printDebug($address);
      if ($governor->Address->count() == 0 && $a = $governor->addAddress($address))
      {
        $this->printDebug(' Address: ' . $a);
        $governor->save();
      }
    }*/

    //PHONE NUMBER
    if (preg_match('/>Phone\\(s\\)\\:<\\/b>([^<]*)<br/is', $text, $phone_arr)) {
        $phone_number = trim($phone_arr[1]);
        if (!$governor->Phone->count()) {
            $phone = $governor->addPhone($phone_number);
            $this->printDebug(' Phone: ' . $phone);
        }
    }

    //IMAGE
    if (!$governor->Image->count() && preg_match('/<img .*?class\\="display" src\\="([^"]*)"/is', $text, $img_arr)) {
        // A separate variable keeps the page URL in $url intact.
        $image_url = $img_arr[1];

        try {
            $fileName = ImageTable::createFiles($image_url, $governor->name_first);
        } catch (Exception $e) {
            // Best effort: a failed download just means no image is attached.
            $fileName = null;
        }

        if ($fileName) {
            //insert image record
            $image = new Image();
            $image->filename = $fileName;
            $image->entity_id = $governor->id;
            $image->title = $governor->name;
            $image->caption = 'From Governors Association website';
            $image->is_featured = true;
            $image->is_free = false;
            $image->url = $image_url;
            $image->save();
            $this->printDebug("Imported image: " . $image->filename);
        }
    }
}
/**
 * get donor info
 *
 * Splits the donor text on "<BR>" into up to three segments: the name, the
 * address line and the summary. The name goes through generatePerson(),
 * the address line through LsLanguage::parseCityStatePostal(), and the
 * summary is stored with tags stripped.
 *
 * Missing segments or address parts are treated as absent rather than read
 * blindly, so short or unparseable input no longer raises undefined
 * offset/index notices.
 *
 * @param string $text donor text with "<BR>"-separated segments
 *
 * @return mixed the donor returned by generatePerson(), with address and
 *               summary attached
 */
private function generateDonor($text)
{
    $text_arr = explode("<BR>", $text);

    // explode() always yields at least one element; the rest are optional.
    $name_html = $text_arr[0];
    $address_line = isset($text_arr[1]) ? $text_arr[1] : '';
    $summary_html = isset($text_arr[2]) ? $text_arr[2] : '';

    $donor = $this->generatePerson(LsHtml::stripTags($name_html, ''));

    $address_arr = LsLanguage::parseCityStatePostal($address_line);

    $a = new Address();
    $a->street1 = isset($address_arr['street1']) ? $address_arr['street1'] : null;
    $a->street2 = isset($address_arr['street2']) ? $address_arr['street2'] : null;
    $a->city = isset($address_arr['city']) ? $address_arr['city'] : null;

    if (isset($address_arr['state'])) {
        $state = AddressStateTable::retrieveByText($address_arr['state']);
        if ($state) {
            $a->State = $state;
        }
    }

    $a->postal = isset($address_arr['zip']) ? $address_arr['zip'] : null;

    $donor->addAddress($a);
    $donor->summary = strip_tags(trim($summary_html));

    return $donor;
}
/**
 * Finds text runs that begin at a mention of the entity and end at the
 * next layout tag.
 *
 * The pattern combines the entity's name regex, its nickname if any, and
 * the tags in LsHtml::$layoutTags. Each match is printed to the debug
 * output with tags stripped and whitespace collapsed. The returned values
 * are the raw, uncleaned matches.
 *
 * @param string $str HTML to search
 * @param Entity $e   entity whose name (and nickname) is searched for
 *
 * @return array|null raw matched runs, or null when nothing matched
 */
public function getSummary($str, Entity $e)
{
    $str = LsHtml::replaceEntities($str);

    $name_alternatives = array($e->getNameRegex());
    if ($e->name_nick) {
        $name_alternatives[] = LsString::escapeStringForRegex($e->name_nick);
    }

    $re = '/((' . implode('|', $name_alternatives) . ')(.*?))<\\/?(' . implode('|', LsHtml::$layoutTags) . ')/isu';
    $this->printDebug($re);

    if (!preg_match_all($re, $str, $matches)) {
        return null;
    }

    // Cleaning is for debug output only; callers get the raw matches.
    foreach ($matches[1] as $raw) {
        $this->printDebug(LsString::spacesToSpace(LsHtml::stripTags($raw)));
    }

    return $matches[1];
}
/**
 * Looks up a person's Businessweek profile and imports what it lists.
 *
 * A Google site search is run for the person's name. The first result that
 * is itself a person.asp profile URL is used. Otherwise each result page is
 * fetched and scanned for a link whose text matches the person's name and
 * whose URL is a profile. The profile's EDUCATION, OTHER AFFILIATIONS and
 * BACKGROUND sections are then parsed. Education entries are cross-checked
 * against the person's Wikipedia text to pick a source URL, and the result
 * is handed to $this->import().
 *
 * Returns null without importing when the search has no results, no profile
 * is found, the profile page is empty, or it lists no education.
 *
 * @param Entity $person person to look up; name and getNameRegex() are used
 *
 * @return null
 */
function getBusinessWeek(Entity $person)
{
    $query = 'site:investing.businessweek.com ' . $person->name;

    $google_scraper = new LsGoogle();
    $google_scraper->setQuery($query);
    $this->printDebug($query);
    $google_scraper->execute();

    if (!$google_scraper->getNumResults()) {
        return null;
    }

    $results = $google_scraper->getResults();
    $businessweek_profile = null;

    // First attempt: a search result that is a profile URL itself.
    foreach ($results as $result) {
        $this->printDebug($result->unescapedUrl);
        if (preg_match('/^.*?person\\.asp\\?personId=\\d+/is', $result->unescapedUrl, $match)) {
            $businessweek_profile = $match[0];
            break;
        }
    }

    // Second attempt: follow each result and look for a profile link whose
    // anchor text matches the person's name.
    if (!$businessweek_profile) {
        foreach ($results as $result) {
            $url = $result->unescapedUrl;
            if (preg_match('/^(.*?)\\&/is', $url, $match)) {
                $url = $match[1];
            }
            if (!stristr($url, 'http://')) {
                $url = 'http://investing.businessweek.com/' . $url;
            }
            $this->printDebug('new url: ' . $url);

            if ($this->browser->get($url)->responseIsError()) {
                continue;
            }

            $text = $this->browser->getResponseText();
            $links = LsHtml::matchLinks($text);

            foreach ($links as $link) {
                if (preg_match('/' . $person->getNameRegex(true) . '/s', $link['text']) && preg_match('/^.*?person\\.asp\\?personId=\\d+/is', $link['url'], $match)) {
                    $url = $match[0];
                    if (!stristr($url, 'http://')) {
                        $url = 'http://investing.businessweek.com/' . $url;
                    }
                    $businessweek_profile = $url;
                    break;
                }
            }

            if ($businessweek_profile) {
                $this->printDebug('Businessweek profile found on 2nd attempt: ' . $businessweek_profile);
                break;
            }
        }

        if (!$businessweek_profile) {
            $this->printDebug('Buisnessweek profile not found');

            return null;
        }
    }

    //go to businessweek profile and get education
    $this->browser->get($businessweek_profile);
    $text = $this->browser->getResponseText();

    if (!$text) {
        $this->printDebug('Businessweek browser error');

        return null;
    }

    $education_found = array();
    $ed_matched = false;
    if (preg_match('#EDUCATION[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<h2#is', $text, $education)) {
        $ed_matched = preg_match_all('/<strong>(.+?)<\\/strong>\\s*(\\d{4})?\\s*<\\/div><div.*?>(.+?)</s', $education[1], $education_found);
    }

    // Default to no employers, so a profile without an OTHER AFFILIATIONS
    // section no longer triggers a foreach over a non-array.
    $employers = array();
    if (preg_match('#OTHER AFFILIATIONS[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/td#s', $text, $employment)) {
        if (preg_match_all('#href\\=\\".+?\\"\\>(.+?)\\<\\/a\\>#is', $employment[1], $employment_found)) {
            $employers = $employment_found[1];
        }
    }

    // Default to an empty summary, which is what the unguarded code ended
    // up with after a notice when BACKGROUND was missing.
    $summary_found = '';
    if (preg_match('#BACKGROUND[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/p>#s', $text, $summary_match)) {
        $summary_found = strip_tags($summary_match[1]);
    }

    if (!$ed_matched) {
        $this->printDebug('Education info not found at Businessweek');

        return null;
    }
    $this->printDebug('Education info found at Businessweek');

    $education_history = null;
    $employment_history = null;

    $wikipedia = new LsWikipedia();
    $wikipedia->request($person->name);
    $wikipedia->execute();
    $plaintext = $wikipedia->getPlainText();

    // Loop-invariant word list passed to getCommonPronouns() for every entry.
    $word_lists = array_merge(LsLanguage::$business, LsLanguage::$schools, LsLanguage::$grammar);

    foreach ($education_found[3] as $key => $institution) {
        $arr = array(
            'institution' => $institution,
            'degree' => $education_found[1][$key],
            'year' => null,
        );
        if ($education_found[2][$key] != '') {
            $arr['year'] = $education_found[2][$key];
        }

        // Cite Wikipedia when getCommonPronouns() reports a match for the
        // institution in the Wikipedia text, otherwise cite the profile.
        $wikipedia_matches = LsLanguage::getCommonPronouns($arr['institution'], $plaintext, $word_lists);
        if ($wikipedia_matches) {
            $arr['source'] = 'http://en.wikipedia.org/wiki/' . str_replace('+', '_', $wikipedia->getTitle());
        } else {
            $arr['source'] = $businessweek_profile;
        }

        $education_history[] = (object) $arr;
    }

    foreach ($employers as $company) {
        $employment_history[] = (object) array('company' => $company, 'title' => null);
    }

    $possible_person = array(
        'name' => $person->name,
        'summary' => $summary_found,
        'employment_history' => (object) $employment_history,
        'education' => (object) $education_history,
    );
    $possible_persons = array((object) $possible_person);

    $this->import($person, $possible_persons);
}
/**
 * Interactively turns one scraped match into an entity and relationships.
 *
 * The operator is prompted on the console to:
 *  1. accept or skip the match;
 *  2. pick an existing similar entity or create a new one (optionally
 *     storing the bio as its summary);
 *  3. for people, optionally parse the bio for organisation names and
 *     create Position relationships to existing or new orgs;
 *  4. add the entity to $this->list and/or relate it to $this->org, when
 *     those are configured.
 *
 * Yes/no prompts are repeated until a valid answer is given, at most five
 * times each.
 *
 * @param array $match scraped fields; 'name' is required, 'bio' and 'rank'
 *                     are optional
 *
 * @return false|null false when the operator skips the match, null otherwise
 */
public function parseResults($match)
{
    // Semicolon-separated copy of the bio, kept for relationship parsing.
    // Defaults to '' so parseBio() never receives an undefined variable.
    $bio_dirty = '';
    if (isset($match['bio'])) {
        $bio_dirty = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($match['bio'], "; ")));
        $bio_dirty = preg_replace('/(\\;\\s)+/is', '; ', $bio_dirty);
    }

    foreach ($match as &$m) {
        $m = LsHtml::replaceEntities(LsString::spacesToSpace(LsHtml::stripTags($m, " ")));
    }
    // Break the reference. Otherwise the later preg_match(..., $m) would
    // write its captures into the last element of $match.
    unset($m);

    if (!isset($match['name'])) {
        return;
    }
    $name = $match['name'];
    $bio = isset($match['bio']) ? $match['bio'] : '';

    $this->printDebug("_________________________\n\nname: " . $name . "\n");
    $this->printDebug("bio: " . $bio . "\n");

    $accept = strtolower($this->readline('Process this entity? (n to skip) '));
    if ($accept == 'n' || $accept == 'no') {
        return false;
    }

    if (!$this->org_org) {
        if ($this->last_first) {
            $entity = PersonTable::parseCommaName($name);
        } else {
            $entity = PersonTable::parseFlatName($name);
        }
        $similar_entities = PersonTable::getSimilarQuery2($entity)->execute();
    } else {
        $entity = new Entity();
        $entity->addExtension('Org');
        foreach ($this->org_extensions as $ext) {
            $entity->addExtension($ext);
        }
        $entity->setEntityField('name', $name);
        // NOTE(review): this normalisation runs after the name is stored
        // and $name is not read again before being reassigned - confirm
        // whether it was meant to precede setEntityField().
        $name = trim($name);
        $name = str_replace('.', '', $name);
        $similar_entities = OrgTable::getSimilarQuery($entity)->execute();
    }

    // Offer each similar entity as a possible match.
    $matched = false;
    foreach ($similar_entities as $similar_entity) {
        if ($similar_entity['primary_ext'] == 'Person') {
            $this->printDebug(' POSSIBLE MATCH: ' . $similar_entity->name . ' (Orgs :: ' . $similar_entity->getRelatedOrgsSummary() . " Bio :: {$similar_entity->summary})");
        } else {
            $this->printDebug(' POSSIBLE MATCH: ' . $similar_entity->name . ' (Summary :: ' . $similar_entity->summary . ')');
        }

        $accept = $this->readline(' Is this the same entity? (y or n)');
        $attempts = 1;
        while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
            $accept = $this->readline(' Is this the same entity? (y or n) ');
            $attempts++;
        }

        if ($accept == 'y') {
            $entity = $similar_entity;
            $matched = true;
            $this->printDebug(' [accepted]');
            break;
        } elseif ($accept == 'break') {
            break;
        }
    }

    // Offer to create a new entity when nothing matched.
    $created = false;
    if (!$matched) {
        if ($entity->getPrimaryExtension() == 'Person') {
            $this->printDebug(' New person: ' . $entity->name_first . ' ' . $entity->name_last);
        } else {
            $this->printDebug(' New org: ' . $entity->name);
        }

        $accept = $this->readline(' create this new entity? (y or n) ');
        $attempts = 1;
        while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
            $accept = $this->readline(' create this new entity? (y or n) ');
            $attempts++;
        }

        if ($accept == 'y') {
            if ($entity->getPrimaryExtension() == 'Person') {
                $this->printDebug("\n Bio: {$bio} \n");
                $accept = $this->readline(' Add this bio? (y or n) ');
                $attempts = 1;
                while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                    $accept = $this->readline(' add this bio? (y or n) ');
                    $attempts++;
                }
                if ($accept == 'y') {
                    $entity->summary = $bio;
                }
            }

            $entity->save();
            $entity->addReference($this->url, null, null, $this->url_name);
            $created = true;
            $this->printDebug(' ' . $entity->name . ' saved');
        }
    }

    // For people, optionally mine the bio for organisations.
    if (($matched || $created) && $entity->getPrimaryExtension() == 'Person') {
        $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
        $attempts = 1;
        while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
            $accept = $this->readline("Parse above bio for possible relationships? (y or n) ");
            $attempts++;
        }

        if ($accept == 'y') {
            $names = $entity->parseBio($bio_dirty);
            $this->printDebug(" Orgs that {$entity} has a position at?");

            foreach ($names as $name) {
                $exists = false;
                $name = trim($name);

                $accept = strtolower($this->readline(" > {$name} :: an org? (y or n or b to break) "));
                $attempts = 1;
                while ($accept != 'y' && $accept != 'n' && $accept != 'b' && $attempts < 5) {
                    $accept = strtolower($this->readline(" {$name} :: an org? (y or n or b to break) "));
                    $attempts++;
                }

                if ($accept == 'b') {
                    break;
                }
                if ($accept != 'y') {
                    continue;
                }

                $this->printDebug(' .....looking for names.....');
                $orgs = EntityTable::getByExtensionAndNameQuery('Org', $name)->limit(10)->execute();
                $related_org = null;

                foreach ($orgs as $org) {
                    $q = LsDoctrineQuery::create()
                        ->from('Relationship r')
                        ->where('entity1_id = ? and entity2_id = ?', array($entity->id, $org->id))
                        ->fetchOne();
                    if ($q) {
                        $this->printDebug(' Position already exists, skipping...');
                        $exists = true;
                        break;
                    }

                    $accept = $this->readline(" Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                    $attempts = 1;
                    while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                        $accept = $this->readline(" Create a position relationship between {$entity->name} and {$org->name}? (y or n) ");
                        $attempts++;
                    }
                    if ($accept == 'y') {
                        $related_org = $org;
                        break;
                    }
                }

                if (!$related_org && !$exists) {
                    $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                    // Reset the retry budget; it used to carry over from
                    // whichever prompt ran last.
                    $attempts = 1;
                    while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                        $accept = $this->readline(" couldn't find org, should this one be created: {$name} (y or n) ");
                        $attempts++;
                    }

                    if ($accept == 'y') {
                        $related_org = new Entity();
                        $related_org->addExtension('Org');
                        // Drop dots from the name, except one followed by "com".
                        $related_org->name = preg_replace('/\\.(?!com)/i', '', $name);
                        $extensions = $this->readline(" what extensions should this org get? (eg 'Business, LobbyingFirm, LawFirm') ");
                        $extensions = preg_split('/\\,\\s*/isu', $extensions, -1, PREG_SPLIT_NO_EMPTY);
                        try {
                            foreach ($extensions as $extension) {
                                $related_org->addExtension($extension);
                            }
                            $related_org->save();
                            $related_org->addReference($this->url, null, null, $this->url_name);
                        } catch (Exception $e) {
                            // Typically a mistyped extension; skip this org.
                            $this->printDebug(' !!! problems with org creation, skipping');
                            $related_org = null;
                        }
                    }
                }

                if (!$related_org) {
                    continue;
                }

                $q = LsDoctrineQuery::create()
                    ->from('Relationship r')
                    ->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $related_org->id, 1))
                    ->fetchOne();
                if ($q) {
                    $this->printDebug(' (relationship already found, skipping...)');
                    continue;
                }

                $relationship = new Relationship();
                $relationship->Entity1 = $entity;
                $relationship->Entity2 = $related_org;
                $relationship->setCategory('Position');

                $title = $this->readline(" Title for this position relationship? (<enter> to skip) ");
                if (strlen($title) > 2) {
                    $relationship->description1 = $title;
                }

                $current = strtolower($this->readline(" Is the relationship current? (y or n or <enter> to skip) "));
                if (in_array($current, array('y', 'yes'))) {
                    $relationship->is_current = 1;
                } elseif (in_array($current, array('n', 'no'))) {
                    $relationship->is_current = 0;
                }

                $board = strtolower($this->readline(" Is the relationship a board position? (y or n or <enter> to skip) "));
                if (in_array($board, array('y', 'yes'))) {
                    $relationship->is_board = 1;
                } elseif (in_array($board, array('n', 'no'))) {
                    $relationship->is_board = 0;
                }

                $relationship->save();
                $relationship->addReference($this->url, null, null, $this->url_name);
                $this->printDebug(" Relationship saved: {$relationship}");
            }
        }
    }

    // List membership and relationship to the configured org.
    if ($matched || $created) {
        if ($this->list) {
            $q = LsDoctrineQuery::create()
                ->from('LsListEntity l')
                ->where('l.entity_id = ? and l.list_id = ?', array($entity->id, $this->list->id))
                ->fetchOne();
            if (!$q) {
                $le = new LsListEntity();
                $le->Entity = $entity;
                $le->LsList = $this->list;
                if (isset($match['rank']) && preg_match('/(\\d+)/isu', $match['rank'], $rank_match)) {
                    $le->rank = $rank_match[1];
                }
                $le->save();
                $this->printDebug('List membership saved');
            }
        }

        if ($this->org) {
            $q = LsDoctrineQuery::create()
                ->from('Relationship r')
                ->where('r.entity1_id = ? and r.entity2_id = ? and r.category_id = ?', array($entity->id, $this->org->id, 1))
                ->fetchOne();
            if ($q) {
                $this->printDebug(' (relationship already found, skipping...)');

                return;
            }

            $relationship = new Relationship();
            $relationship->Entity1 = $entity;
            $relationship->Entity2 = $this->org;
            $relationship->setCategory($this->relationship_category);

            if ($this->description1) {
                $relationship->description1 = $this->description1;
            } else {
                $description = $this->readline(" what description to give this relationship ({$relationship}) ? (less than 3 chars will skip)");
                if (strlen($description) > 2) {
                    $relationship->description1 = $description;
                }
            }

            if ($this->relationship_category == 'Position') {
                $relationship->is_board = $this->is_board;
            } elseif ($this->relationship_category == 'Donation') {
                if ($this->amount) {
                    $relationship->amount = $this->amount;
                } else {
                    $amount = $this->readline(" what amount ({$relationship}) ? (less than 3 chars will skip)");
                    if (strlen($amount) > 1) {
                        $relationship->amount = $amount;
                    }
                }
            }

            $relationship->save();
            $relationship->addReference($this->url, null, null, $this->url_name);
            $this->printDebug(" Relationship saved: {$relationship}");
        }
    }

    // The former "dump history" stub was removed: it tested
    // $match['affiliation1'] but read $match['affiliation'] into a variable
    // that was never used, so its only effect was an undefined-index notice.
}
/**
 * Populates the record from one scraped row.
 *
 * Indices 1 to 7 of $row are read, in order: id, name, lifespan, type,
 * party, state and term. The term is split on "-" into termStart and
 * termEnd. A term with no "-" leaves termEnd null instead of raising an
 * undefined-offset notice.
 *
 * @param array $row scraped cells, indexed as described above
 */
public function __construct($row)
{
    $this->id = $row[1];
    $this->name = trim(LsHtml::replaceEntities($row[2]));
    $this->lifespan = trim(LsHtml::replaceEntities($row[3]));
    $this->type = trim($row[4]);
    $this->party = trim($row[5]);
    $this->state = trim($row[6]);

    // Pad to two elements so a missing end is null, not an undefined offset.
    $term = array_pad(explode('-', trim($row[7])), 2, null);
    $this->termStart = $term[0];
    $this->termEnd = $term[1];
}
/**
 * Stores the text after normalising it.
 *
 * The input is passed through LsHtml::replaceEntities() and then
 * LsString::utf8TransUnaccent(); the result is kept in $this->text.
 *
 * @param string $text raw text to normalise and store
 */
function __construct($text)
{
    $this->text = LsString::utf8TransUnaccent(LsHtml::replaceEntities($text));
}
private function findBasicInfo() { if (!$this->sets) { return null; } $re = '/^([^<]*?<[^>]*>)*?[^<]*?(?<!([\\.,$\\/]))(\\b[2-9]\\d\\b)(?!((,\\s+200\\d|199\\d)|%|[,\\.]\\d|[-\\s]+([Yy]ears?\\s+(with|career)|[Dd]ays?|[Mm]onths?)\\b))/su'; $age_match_sets = array(); //go through the sets of name matches and find age matches for each foreach ($this->sets as $set) { $age_matches = array(); for ($i = 0; $i < count($set); $i++) { $len = $i == count($set) - 1 ? 2000 : $set[$i + 1]['pos'] - $set[$i]['pos']; if ($len > 100000) { continue; } $str = substr($this->text, $set[$i]['pos'], $len); if (preg_match($re, $str, $match)) { $n = preg_match_all('/<(\\p{L}+)[^>]*>/s', $match[0], $m, PREG_SET_ORDER); $tag = 'empty'; if ($n > 0) { $tag = $m[count($m) - 1][1]; } $stripped = LsHtml::stripTags($match[0]); if (strlen($stripped) < 2000) { $age_matches[] = array('ind' => $i, 'age_match' => $match, 'age' => $match[3], 'name_match' => $set[$i], 'num_tags' => $n, 'tag' => $tag, 'len' => strlen($match[0])); } //$this->printDebug($i . '. ' . $set[$i]['name'] . ' : ' . $match[3] . ' : ' . strlen($match[0]) . ' : ' . $n . ' : ' . $tag); //$this->printDebug($set[$i]['match'][1][0]); } //else $this->printDebug('--'); //$this->printDebug($set[$i]['match'][1][0]); } $this->printDebug('count age matches is ' . 
count($age_matches)); $age_match_sets[] = $age_matches; } //find the best set (most unique names and ages) $max = 0; $best = array(array('unique' => array(), 'set' => array())); foreach ($age_match_sets as $age_matches) { if (count($age_matches) < 2) { continue; } $unique = array($age_matches[0]['name_match']['id']); $temp = array($age_matches[0]); for ($i = 1; $i < count($age_matches); $i++) { if ($age_matches[$i]['ind'] - 4 <= $age_matches[$i - 1]['ind']) { $temp[] = $age_matches[$i]; if (!in_array($age_matches[$i]['name_match']['id'], $unique)) { $unique[] = $age_matches[$i]['name_match']['id']; } } else { if (count($unique) > $max) { $max = count($unique); if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($best[0]['unique']) > 2) { array_unshift($best, array('unique' => $unique, 'set' => $temp)); } else { $best = array(array('unique' => $unique, 'set' => $temp)); } } else { if (count(array_intersect($best[0]['unique'], $unique)) == 0 && count($unique) > 2) { $best[] = array('unique' => $unique, 'set' => $temp); } } $unique = array($age_matches[$i]['name_match']['id']); $temp = array($age_matches[$i]); } } if (count($unique) > $max) { $max = count($unique); if (count(array_intersect($best[0]['unique'], $unique)) == 0) { array_unshift($best, array('unique' => $unique, 'set' => $temp)); } else { $best = array(array('unique' => $unique, 'set' => $temp)); } } } $best = $best[0]['set']; //$this->printDebug('count best is ' . count($best)); //find the tag all names have in common (if there is one) $tag_counts = array(); foreach ($best as $b) { if (isset($tag_counts[$b['tag']])) { $tag_counts[$b['tag']]++; } else { $tag_counts[$b['tag']] = 1; } $this->printDebug($b['ind'] . '. ' . $b['name_match']['name'] . ' : ' . $b['age'] . ' : ' . strlen($b['age_match'][0]) . ' : ' . $b['num_tags'] . ' : ' . 
$b['tag']); } $tag = null; foreach ($tag_counts as $k => $v) { if ($v > 0.8 * count($best)) { $tag = $k; break; } } $age_set = array(); if ($tag) { foreach ($best as $b) { if ($b['tag'] == $tag) { $age_set[] = $b; } } } else { $age_set = $best; } $age_set = LsArray::multiSort($age_set, array('name_match', 'id')); //find duplicates and determine the best match out of the pair/set $singles = array(); $doubles = array(); $num_tags = 0; $len = 0; for ($i = 0; $i < count($age_set); $i++) { $double = array($age_set[$i]); while ($i < count($age_set) - 1 && $double[0]['name_match']['id'] == $age_set[$i + 1]['name_match']['id']) { $double[] = $age_set[$i + 1]; $i++; } if (count($double) == 1) { $singles[] = $age_set[$i]; $num_tags += $age_set[$i]['num_tags']; $len += $age_set[$i]['len']; } else { $doubles[] = $double; } } if (count($singles) < 3) { $unique = array(); $sets = array(array()); $age_set = LsArray::multiSort($age_set, array('name_match', 'pos')); foreach ($age_set as $a) { //$this->printDebug($a['name_match']['name'] . ": "); if (!in_array($a['name_match']['id'], $unique)) { $unique[] = $a['name_match']['id']; $sets[count($sets) - 1][] = $a; } else { $unique = array($a['name_match']['id']); $sets[] = array($a); } } $age_set = $sets[0]; } else { $avg_len = $len / count($singles); $avg_tags = $num_tags / count($singles); //$this->printDebug('len is ' . $avg_len . ' and tags is ' . 
$avg_tags); foreach ($doubles as $double) { $best = null; foreach ($double as $d) { $lf = $d['len'] / $avg_len; $tf = $d['num_tags'] / $avg_tags; $f = abs(2 - ($lf + $tf)); if (!$best) { $best = $d; } else { if (abs($avg_tags - $best['num_tags']) > abs($avg_tags - $d['num_tags'])) { $best = $d; } else { if (abs($avg_tags - $best['num_tags']) == abs($avg_tags - $d['num_tags']) && abs($avg_len - $best['len']) == abs($avg_len - $d['len'])) { $best = $d; } } } } $singles[] = $best; } $age_set = LsArray::multiSort($singles, array('name_match', 'pos')); } //determine which directors were found, which weren't $ids = array(); foreach ($age_set as $a) { $ids[] = $a['name_match']['id']; //$this->printDebug($a['ind'] . '. ' . $a['name_match']['name'] . ' : ' . $a['age'] . ' : ' . strlen($a['age_match'][0]) . ' : ' . $a['num_tags'] . ' : ' . $a['tag']); } foreach ($this->people as $p) { if (!in_array($p->id, $ids)) { $category = Doctrine::getTable('RelationshipCategory')->findOneByName('Position'); $relationship = LsDoctrineQuery::create()->from('Relationship r')->where('r.entity1_id = ?', $p->id)->addWhere('r.entity2_id = ?', $this->corp->id)->addWhere('r.category_id = ?', $category->id)->addWhere('r.description1 = ?', 'Director')->fetchOne(); if ($relationship) { $relationship->is_current = 0; $relationship->save(); } } } if (count($age_set) < 0.5 * count($this->people)) { $this->printDebug('not enough names in age set:' . count($age_set) . ' vs. ' . 
count($this->people)); return null; } //figure out which tags surround name/age pairs $tag_arr = array('<table' => array(), '<tr' => array(), '<td' => array(), '<div' => array(), '<br' => array(), '<p' => array()); $tag_arr = array('table' => array(), 'tr' => array(), 'td' => array(), 'div' => array(), 'br' => array(), 'p' => array()); for ($i = 1; $i < count($age_set) - 1; $i++) { $str = substr($this->text, $age_set[$i - 1]['name_match']['pos'], $age_set[$i + 1]['name_match']['pos'] - $age_set[$i - 1]['name_match']['pos']); //$this->printDebug($str); foreach ($tag_arr as $tag => &$arr) { $tag_str = LsHtml::getStringInTag($str, $tag, $age_set[$i]['name_match']['pos'] - $age_set[$i - 1]['name_match']['pos']); if (strlen($tag_str) > 0) { $arr[] = strlen($tag_str); //$this->printDebug($tag_str); //echo "\n*****\n"; } } } arsort($tag_arr); //var_dump($tag_arr); //$this->printDebug(count($this->people)); if (count(reset($tag_arr)) == 0) { $this->printDebug('problems with enclosing tag detection'); return null; } foreach ($tag_arr as $tag => $arr) { $avg = array_sum($arr) / count($arr); $splitter = $tag; break; } $tag_counts = array(); for ($i = 0; $i < count($age_set) - 1; $i++) { $str = substr($this->text, $age_set[$i]['name_match']['pos'], $age_set[$i + 1]['name_match']['pos'] - $age_set[$i]['name_match']['pos']); str_ireplace('<' . $splitter, ' ', $str, $count); $tag_counts[] = $count; } sort($tag_counts); $ct = $tag_counts[0]; if (!$ct) { return null; } $post_strlen = 0; $info_arr = array(); for ($i = 0; $i < count($age_set); $i++) { $a = $age_set[$i]; $matches = LsString::striposMulti($this->text, '</' . $splitter, $ct, $a['name_match']['pos']); $end = $matches[count($matches) - 1]; $start = strripos(substr($this->text, 0, $a['name_match']['pos']), '<' . 
$splitter); $str = substr($this->text, $start, $end - $start); if ($i == count($age_set) - 1 && count($matches) > 1) { $end = $matches[count($matches) - 2]; $str2 = substr($this->text, $start, $end - $start); $avg = strlen(implode(' ', $segments)) / count($segments); if (abs(strlen($str2) - $avg) < abs(strlen($str) - $avg)) { $str = $str2; } } $segments[] = $str; //$this->printDebug($str); $info = $this->parseSegment($str, $a['name_match']['pos'] - $start, $a['name_match']['pos'] - $start + strlen($a['name_match']['match'][2][0])); $info = $this->parseBlurb($info, $a); //looks to see if bio appears aftr the parsed segment if ($i < count($age_set) - 1) { $next_start = strripos(substr($this->text, 0, $age_set[$i + 1]['name_match']['pos']), '<' . $splitter); $post_str = substr($this->text, $end, $next_start - $end); } else { $avg = $post_strlen / (count($age_set) - 1); $post_str = substr($this->text, $end, $avg); } $post_strlen += strlen($post_str); $post_str = LsHtml::replaceFontStyleTags($post_str); $person = $a['name_match']['person']; $last = LsString::escapeStringForRegex($person->name_last); $info['post_blurb'] = ''; if (preg_match_all('/>([^<]*' . $last . '[^<]*)</isu', $post_str, $matches)) { $post_blurb = implode(' ', $matches[1]); $post_blurb = trim(preg_replace('/\\s+/s', ' ', $post_blurb)); if (strlen($post_blurb) > 40) { $info['post_blurb'] = $post_blurb; } } $info_arr[] = $info; //echo "\n\n***\n\n"; } $ct = 0; $unv_ct = 0; foreach ($info_arr as $info) { if (strlen($info['post_blurb']) > strlen($info['blurb'])) { $ct++; } if ($info['img'] == null && $info['unverified_img'] != null) { $unv_ct++; } } //if most of the profile segments have images at the end, check to see if they belong to the next profile segment if ($unv_ct > count($age_set) - 3) { for ($i = 0; $i < count($age_set); $i++) { $len = strripos(substr($this->text, 0, $age_set[$i]['name_match']['pos']), '<' . 
$splitter); $tag_start = strripos(substr($this->text, 0, $len), '<img'); $str = substr($this->text, $tag_start, 200); if (preg_match('/^<img[^>]+src=[\'"]([^\'"]+)[\'"]/is', $str, $match) == 1) { $info['img'] = $match[1]; } else { if ($i == 0) { break; } } } } for ($i = 0; $i < count($info_arr); $i++) { if ($ct > 0.8 * count($age_set)) { $info_arr[$i]['blurb'] = $info_arr[$i]['post_blurb']; if (!$info_arr[$i]['since']) { $info_arr[$i]['since'] = $this->getStartDate($info_arr[$i]['blurb']); } } $this->importDirectorInfo($info_arr[$i], $age_set[$i]); $this->printDebug("\n***"); } //$this->printDebug($splitter); //var_dump($tag_counts); }
/**
 * Extracts likely person names from the text nodes of an HTML document.
 *
 * Two passes are made over the entity-decoded markup:
 *  - "First [Middle ...] Last[, Suffix]" candidates that sit alone between a
 *    `>` and the next `<` are kept only when they contain a stand-alone
 *    capital initial, start with a word listed in
 *    LsLanguage::$commonFirstNames, or start with a first name that already
 *    exists in the Person table;
 *  - "Last, First ..." candidates are parsed with
 *    PersonTable::parseCommaName() and always added, rendered through
 *    getFullName(false).
 *
 * @param string $text raw HTML
 * @return array list of name strings, in match order (duplicates are not removed)
 */
static function getHtmlPersonNames($text) {
  // "First [Middle ...] Last[-Last][, Sfx]" optionally followed by asterisks, alone in a text node
  $first_last_re = '/>\\s*(\\p{Lu}\'?(\\p{L}+|\\.)?\\s+(\\p{Lu}\'?(\\s+|\\p{L}+\\s+|\\.\\s*)?){0,2}\\p{Lu}\'?\\p{L}+(\\-\\p{Lu}\'?\\p{L}+)?(\\,?\\s+\\p{Lu}\\p{L}{1,4}\\.?)?)\\**\\s*</su';
  // "Last[-Last], First [Middle ...][, Sfx]" optionally followed by asterisks, alone in a text node
  $last_first_re = '/>\\s*(\\p{Lu}\'?\\p{L}+(\\-\\p{Lu}\'?\\p{L}+)?\\,\\s+(\\p{Lu}\'?(\\p{L}+|\\.)?(\\s+(\\p{Lu}\'?(\\s+|\\p{L}+\\s+|\\.\\s*)?){0,2})?)(\\,?\\s+\\p{Lu}\\p{L}{1,4}\\.?)?)\\**\\s*</su';

  $text = LsHtml::replaceEntities($text);
  $name_matches = array();

  // first name => bool, so the Person table is queried at most once per
  // distinct first name during this call
  $known_first_names = array();

  if (preg_match_all($first_last_re, $text, $matches, PREG_OFFSET_CAPTURE)) {
    //LOOP THROUGH MATCHES TO CONFIRM NAMES
    foreach ($matches[1] as $m) {
      $candidate = $m[0];
      $parts = LsString::split(trim($candidate));
      $first = $parts[0];

      // cheap checks first: a middle initial or a well-known first name
      $is_name = preg_match('/\\s+\\p{Lu}\\.?\\s/', $candidate)
        || in_array($first, LsLanguage::$commonFirstNames);

      // only hit the database when the cheap checks were inconclusive
      if (!$is_name) {
        if (!isset($known_first_names[$first])) {
          $q = LsDoctrineQuery::create()->from('Person p')->where('p.name_first = ?', $first);
          $known_first_names[$first] = $q->count() > 0;
        }
        $is_name = $known_first_names[$first];
      }

      //ADD NAME TO MATCH LIST IF IT FITS CONDITIONS
      if ($is_name) {
        $name_matches[] = $candidate;
      }
    }
  }

  if (preg_match_all($last_first_re, $text, $matches, PREG_OFFSET_CAPTURE)) {
    foreach ($matches[1] as $m) {
      $person = PersonTable::parseCommaName($m[0]);
      $name_matches[] = $person->getFullName(false);
    }
  }

  return $name_matches;
}
/**
 * Searches Google for "<entity name> board", downloads the top hit (skipping
 * a yahoo.com first result) and flags each supplied board relationship as
 * current or not depending on whether one of the person's name regexes
 * matches the page text.
 *
 * Relationships are only modified when at least two distinct people were
 * found on the page; they are only saved (and given the page as a reference)
 * when $this->testMode is false.
 *
 * @param array $board_rels records exposing Entity1, entity1_id, is_current,
 *                          save() and addReference() -- presumably Relationship
 *                          rows for board positions; confirm against callers
 * @return int|null  1  relationships were evaluated and flagged
 *                   0  fewer than two people matched, or an exception was thrown
 *                  -1  the search produced no usable result
 *                  null the page request came back as an error response
 */
public function checkBoardPage($board_rels) {
  $goog = new LsGoogle();
  $goog->setQuery($this->entity->name . " board");
  $results = $goog->execute();
  $results = $goog->parseSearchResults($results);

  if (empty($results)) {
    return -1;
  }

  $url = $results[0]['unescapedUrl'];

  // stripos() returns 0 for a match at the very start, so compare against false
  if (stripos($url, "yahoo.com") !== false) {
    // fall back to the second hit, if there is one
    if (!isset($results[1]['unescapedUrl'])) {
      return -1;
    }
    $url = $results[1]['unescapedUrl'];
  }
  $this->printDebug($url);

  try {
    if ($this->browser->get($url)->responseIsError()) {
      // kept from the original behaviour: an HTTP error yields no status code
      return null;
    }

    $text = $this->browser->getResponseText();
    $text = LsHtml::replaceEntities($text);

    // each entry pairs a relationship with its found flag, so the second pass
    // does not depend on $board_rels being a zero-based list
    $checked = array();
    $unique_arr = array();

    foreach ($board_rels as $br) {
      $found = 0;
      foreach ($br->Entity1->getNameRegexes() as $regex) {
        // only existence matters, so a single preg_match is enough
        if (preg_match($regex, $text)) {
          $found = 1;
          if (!in_array($br->entity1_id, $unique_arr)) {
            $unique_arr[] = $br->entity1_id;
          }
          break;
        }
      }
      $checked[] = array($br, $found);
      $this->printDebug($br->Entity1->name . " > " . $found);
    }

    // a single hit is not enough evidence that this really is the board page
    if (count($unique_arr) <= 1) {
      return 0;
    }

    $this->printDebug("\tenough board member names found to mark as current or not");
    foreach ($checked as $pair) {
      list($br, $found) = $pair;
      $br->is_current = ($found == 1) ? 1 : 0;
      if (!$this->testMode) {
        $br->save();
        $br->addReference($url, null, null, $this->entity->name . ' board', null, null);
      }
    }
    return 1;
  } catch (Exception $e) {
    return 0;
  }
}
/**
 * Returns the first stored paragraph with tags stripped, HTML entities
 * replaced and whitespace runs collapsed.
 *
 * @return string|null the cleaned paragraph, or null when $this->_paragraphs
 *                     has no element at index 0
 */
public function getCleanFirstParagraph() {
  // The original guard was inverted: it returned null when paragraphs were
  // present and indexed [0] when they were not.
  if (!isset($this->_paragraphs[0])) {
    return null;
  }

  $first = LsHtml::stripTags($this->_paragraphs[0]);
  $first = LsHtml::replaceEntities($first);
  return LsString::spacesToSpace($first);
}
/**
 * Downloads the proxy document at $url and tries to find, for every eligible
 * roster row, how that person's name is written in the document.
 *
 * The fetched text is stored in $this->proxyText after entity replacement and
 * LsString::utf8TransUnaccent(). Rows with an empty 'officerTitle' that are
 * not flagged as director are skipped. For matched rows the keys 'proxyUrl',
 * 'proxyYear', 'nameLast' and 'proxyName' are added. A row whose
 * 'personName' starts with a single letter (e.g. "O LEARY") has that letter
 * joined to the next word, and the rewritten name is kept in the returned row.
 *
 * If the request fails, a line is appended to $this->logFile and the roster
 * is returned unchanged.
 *
 * @param array  $roster     rows with 'personName', 'officerTitle' and 'isDirector'
 * @param string $url        address of the proxy document
 * @param mixed  $proxy_year stored verbatim under 'proxyYear'
 * @return array the roster, annotated where a name was found
 */
private function getProxyData($roster, $url, $proxy_year) {
  echo "fetching data from proxy at {$url} \n\n";

  if ($this->browser->get($url)->responseIsError()) {
    //Error response (eg. 404, 500, etc)
    $log = fopen($this->logFile, 'a');
    // an unwritable log must not abort the import
    if ($log !== false) {
      fwrite($log, "Couldn't get " . $url . "\n");
      fclose($log);
    }
    return $roster;
  }

  $this->proxyText = $this->browser->getResponseText();
  $this->proxyText = LsHtml::replaceEntities($this->proxyText, ENT_QUOTES, 'UTF-8');
  $this->proxyText = LsString::utf8TransUnaccent($this->proxyText);

  foreach ($roster as &$r) {
    //make sure this is not form 4 data for a corporation, continue to the next if it is
    if ($r['officerTitle'] == '' && $r['isDirector'] != 1 && strtoupper($r['isDirector']) != strtoupper('true')) {
      continue;
    }

    $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
    // an empty name would make stripos() search for an empty needle
    if (!$parts) {
      continue;
    }

    //first word, but has to be part of last name because form4 names are in format RUBIN ROBERT E
    $last = trim($parts[0]);

    //sometimes O'LEARY can appear as O LEARY in the form 4
    if (strlen($last) == 1) {
      $r['personName'] = $last . substr($r['personName'], 2);
      $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
      $last = trim($parts[0]);
    }

    //prepare regex to match occurrences of full name
    //case insensitive to accommodate for various irregularities in names
    $re = LsLanguage::buildLooseNameRegex($r['personName']);
    $offset = 0;
    $found = true;

    //use stripos (much faster than regex) to find occurrences of the first word in the form 4 name (assumed to be part of the last name)
    //needs to be case insensitive
    //continue searching for last name in proxy until a matching full name (proxyName) is found
    while (!isset($r['proxyName']) && $found !== false) {
      $found = stripos($this->proxyText, $last, $offset);
      $offset = $found + 1;
      if ($found === false) {
        continue;
      }

      // Window of up to 70 bytes before and 50 bytes after the hit. The start
      // is clamped at 0 because a negative substr() offset counts from the
      // END of the string, which made hits in the first 70 bytes unmatchable.
      $window_start = max(0, $found - 70);
      $str = substr($this->proxyText, $window_start, $found + 50 - $window_start);

      preg_match_all($re, $str, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);

      foreach ($matches as $match) {
        if (stristr($match[1][0], '=')) {
          continue;
        }

        //since we may or may not be working with the full last name, use getLastName to return full last name
        $new_last = $this->getLastName($r['personName'], $match[1][0]);
        if (!$new_last) {
          continue;
        }

        //if last name produced by case insensitive search has no capital letters, not a match
        if (preg_match('/\\p{Lu}/su', $new_last) == 0) {
          continue;
        }

        //now that we have a last name, pull the full name from the string
        $name = LsLanguage::getNameWithLast($match[0][0], $new_last);
        if (!$name) {
          continue;
        }

        $parts = preg_split('/\\s+/isu', $name['nameStart'], -1, PREG_SPLIT_NO_EMPTY);
        $non_prefixes = array_diff($parts, PersonTable::$nameParsePrefixes);

        //if all we've found are matching prefixes, not a match
        if (count($non_prefixes) == 0) {
          continue;
        }

        //compatibility check to correct for vagueness of regex
        $name1_parts = preg_split('/\\s+/', $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
        $ct = 0;
        foreach ($non_prefixes as $n) {
          foreach ($name1_parts as $p) {
            if (stripos($n, $p) === 0 || stripos($p, $n) === 0) {
              $ct++;
            }
          }
        }

        //phew -- if name is (somewhat) compatible, assume we've found it
        if ($ct > 0) {
          $r['proxyUrl'] = $url;
          $r['proxyYear'] = $proxy_year;
          $r['nameLast'] = trim(LsString::spacesToSpace($name['nameLast']));
          $r['proxyName'] = trim(LsString::spacesToSpace($name['nameFull']));
        }
      }
    }
  }
  unset($r);

  return $roster;
}
/**
 * Action for adding many relationships to the current entity at once.
 *
 * The request is handled in one of four modes:
 *  - POST with commit "Begin"/"Continue": resolves or creates the Reference,
 *    collects candidate names according to `add_method` (scrape, upload,
 *    summary, text, db_search), optionally shows the name-confirmation step,
 *    then either hands the names to the relationship toolbar (`verify_method`
 *    "onebyone", via redirect) or looks up matching entities for each name
 *    and exposes them in $this->matches.
 *  - a `page` parameter: only stores page/num on the action.
 *  - POST with commit "Submit": creates entities where requested and one
 *    Relationship per selected entity, then redirects to the entity page.
 *  - POST with commit "Cancel": redirects to the entity page.
 *
 * @param sfWebRequest $request NOTE(review): type inferred from the
 *                              getParameter()/isMethod()/getFiles() calls
 * @return void
 */
public function executeAddBulk($request) {
  $this->checkEntity($request, false, false);
  $this->reference_form = new ReferenceForm();
  $this->reference_form->setSelectObject($this->entity);
  $this->add_bulk_form = new AddBulkForm();

  //get possible default categories
  $this->categories = LsDoctrineQuery::create()->select('c.name, c.name')->from('RelationshipCategory c')->orderBy('c.id')->fetchAll(PDO::FETCH_KEY_PAIR);
  array_unshift($this->categories, '');

  if ($request->isMethod('post') && in_array($request->getParameter('commit'), array('Begin', 'Continue'))) {
    // stays null when the request carries ref_id; loaded lazily where needed
    $ref = null;

    if ($request->hasParameter('ref_id')) {
      $this->ref_id = $request->getParameter('ref_id');
    } else {
      $refParams = $request->getParameter('reference');
      $this->reference_form->bind($refParams);
      $restOfParams = (array) $request->getParameterHolder();
      $restOfParams = array_shift($restOfParams);
      $this->add_bulk_form->bind($restOfParams, $request->getFiles());

      if (!$this->reference_form->isValid() || !$this->add_bulk_form->isValid()) {
        return;
      }

      if ($this->ref_id = $refParams['existing_source']) {
        $ref = Doctrine::getTable('Reference')->find($this->ref_id);
      } else {
        $ref = new Reference();
        $ref->object_model = 'Entity';
        $ref->object_id = $this->entity->id;
        $ref->source = $refParams['source'];
        $ref->name = $refParams['name'];
        $ref->source_detail = $refParams['source_detail'];
        $ref->publication_date = $refParams['publication_date'];
        $ref->save();
      }
      $this->ref_id = $ref->id;
      $this->reference = $ref;
    }

    $verify_method = $request->getParameter('verify_method');

    if ($this->add_method = $request->getParameter('add_method')) {
      if ($this->add_method == 'scrape') {
        //scrape ref url
        //set names to confirm
        $browser = new sfWebBrowser();
        $entity_types = $request->getParameter('entity_types');

        // the reference was not loaded above when ref_id came with the request
        if (!$ref && $this->ref_id) {
          $ref = Doctrine::getTable('Reference')->find($this->ref_id);
        }

        //FIND NAMES AT URL USING COMBO OF OPENCALAIS & LS CUSTOM HTML PARSING
        if ($ref && !$browser->get($ref->source)->responseIsError()) {
          $text = $browser->getResponseText();
          $this->names = LsTextAnalysis::getHtmlEntityNames($text, $entity_types);
          $text = LsHtml::findParagraphs($text);
          $this->text = preg_replace('/<[^b][^>]*>/is', " ", $text);
          $this->confirm_names = true;
          return;
        } else {
          $request->setError('csv', 'problems finding names at that url');
        }
      } elseif ($this->add_method == 'upload') {
        $file = $this->add_bulk_form->getValue('file');

        // check before dereferencing; the original tested the (always
        // non-empty) target path instead, so this message was unreachable
        if (!$file) {
          $request->setError('file', 'You need to upload a file.');
          return;
        }

        $filename = 'uploaded_' . sha1($file->getOriginalName());
        $extension = $file->getExtension($file->getOriginalExtension());
        $filePath = sfConfig::get('sf_temp_dir') . '/' . $filename . $extension;
        $file->save($filePath);

        $spreadsheetArr = LsSpreadsheet::parse($filePath);
        if (!$spreadsheetArr) {
          $request->setError('file', 'The file you uploaded could not be parsed properly.');
          return;
        }

        $names = $spreadsheetArr['rows'];
        if (!in_array('name', $spreadsheetArr['headers'])) {
          $request->setError('file', 'The file you uploaded could not be parsed properly because there is no "name" column.');
          return;
        }

        if (in_array('summary', $spreadsheetArr['headers'])) {
          // Normalise typographic quotes to ASCII. The original literals had
          // been reduced to '?', which turned every question mark in a
          // summary into an apostrophe.
          // NOTE(review): assumes the parsed spreadsheet text is UTF-8 -- confirm.
          $curly_single = array("\xE2\x80\x98", "\xE2\x80\x99");
          $curly_double = array("\xE2\x80\x9C", "\xE2\x80\x9D");
          foreach ($names as &$name) {
            $name['summary'] = str_replace($curly_single, "'", $name['summary']);
            $name['summary'] = str_replace($curly_double, '"', $name['summary']);
            if (isset($name['title'])) {
              $name['description1'] = $name['title'];
            }
          }
          unset($name);
        }
      } elseif ($this->add_method == 'summary') {
        //parse summary for names
        $this->text = $this->entity->summary;
        $entity_types = $request->getParameter('entity_types');
        $this->names = LsTextAnalysis::getTextEntityNames($this->text, $entity_types);
        $this->confirm_names = true;
        return;
      } elseif ($this->add_method == 'text') {
        $manual_names = $request->getParameter('manual_names');
        if ($manual_names) {
          $manual_names = preg_split('#[\\r\\n]+#', $manual_names);
          $manual_names = array_map('trim', $manual_names);
          $names = array();
          foreach ($manual_names as $name) {
            $names[] = array('name' => $name);
          }
        } else {
          $request->setError('csv', 'You did not add names properly.');
          return;
        }
      } elseif ($this->add_method == 'db_search') {
        $this->db_search = true;
      }
    }

    //intermediate scrape page -- takes confirmed names, builds names arr
    if ($confirmed_names = $request->getParameter('confirmed_names')) {
      $restOfParams = (array) $request->getParameterHolder();
      $restOfParams = array_shift($restOfParams);
      $this->add_bulk_form->bind($restOfParams, $request->getFiles());

      if (!$this->add_bulk_form->isValid()) {
        $this->reference = Doctrine::getTable('reference')->find($this->ref_id);
        // NOTE(security): unserialize() on a request parameter can instantiate
        // arbitrary classes. Left in place to keep the confirm-page round trip
        // working; should move to json_encode/json_decode together with the
        // template that serializes this value.
        $this->names = unserialize(stripslashes($request->getParameter('names')));
        $this->confirm_names = true;
        return;
      }

      $names = array();
      foreach ($confirmed_names as $cn) {
        $names[] = array('name' => $cn);
      }

      $manual_names = $request->getParameter('manual_names');
      if ($manual_names) {
        $manual_names = preg_split('#[\\r\\n]+#', $manual_names);
        $manual_names = array_map('trim', $manual_names);
        foreach ($manual_names as $name) {
          $names[] = array('name' => $name);
        }
      }
    }

    // LOAD IN RELATIONSHIP DEFAULTS
    if (isset($verify_method)) {
      $defaults = $request->getParameter('relationship');
      // only filled for "enmasse"; defined here so the type filter below never
      // reads an undefined variable
      $extensions_arr = array();

      if ($verify_method == 'enmasse') {
        $this->default_type = $request->getParameter('default_type');
        $this->order = $request->getParameter('order');
        $category_name = $request->getParameter('relationship_category_all');
        $this->extensions = ExtensionDefinitionTable::getByTier(2, $this->default_type);
        foreach ($this->extensions as $ext) {
          $extensions_arr[] = $ext->name;
        }
      } else {
        $category_name = $request->getParameter('relationship_category_one');
      }

      if ($category_name) {
        $this->category_name = $category_name;
        if (!($category = Doctrine::getTable('RelationshipCategory')->findOneByName($category_name))) {
          $request->setError('csv', 'You did not select a relationship category.');
          return;
        }

        $formClass = $category_name . 'Form';
        $categoryForm = new $formClass(new Relationship());
        $categoryForm->setDefaults($defaults);
        $this->form_schema = $categoryForm->getFormFieldSchema();

        if (in_array($category_name, array('Position', 'Education', 'Membership', 'Donation', 'Lobbying', 'Ownership'))) {
          $this->field_names = array('description1', 'start_date', 'end_date', 'is_current');
        } else {
          $this->field_names = array('description1', 'description2', 'start_date', 'end_date', 'is_current');
        }

        $extraFields = array(
          'Position' => array('is_board', 'is_executive'),
          'Education' => array('degree_id'),
          'Donation' => array('amount'),
          'Transaction' => array('amount'),
          'Lobbying' => array('amount'),
          'Ownership' => array('percent_stake', 'shares')
        );
        if (isset($extraFields[$category_name])) {
          $this->field_names = array_merge($this->field_names, $extraFields[$category_name]);
        }
      }

      $this->matches = array();

      // BOOT TO TOOLBAR OR LOOK FOR MATCHES FOR ENMASSE ADD
      if (isset($names) && count($names) > 0 || isset($this->db_search)) {
        if ($verify_method == 'onebyone') {
          if (isset($category_name)) {
            $defaults['category'] = $category_name;
          }

          $toolbar_names = array();
          // $names is not set when only db_search brought us here
          if (isset($names)) {
            foreach ($names as $name) {
              $toolbar_names[] = $name['name'];
            }
          }

          $this->getUser()->setAttribute('toolbar_names', $toolbar_names);
          $this->getUser()->setAttribute('toolbar_entity', $this->entity->id);
          $this->getUser()->setAttribute('toolbar_defaults', $defaults);
          $this->getUser()->setAttribute('toolbar_ref', $this->ref_id);
          $this->redirect('relationship/toolbar');
        } else {
          $this->category_name = $category_name;

          if (isset($this->db_search)) {
            $q = LsDoctrineQuery::create()->from('Entity e')->where('(e.summary rlike ? or e.blurb rlike ?)', array('[[:<:]]' . $this->entity->name . '[[:>:]]', '[[:<:]]' . $this->entity->name . '[[:>:]]'));

            // the same aliases drive both the query and the highlighting
            // regex, so the regex alternatives are collected once here
            $alias_patterns = array();
            foreach ($this->entity->Alias as $alias) {
              $q->orWhere('(e.summary rlike ? or e.blurb rlike ?)', array('[[:<:]]' . $alias->name . '[[:>:]]', '[[:<:]]' . $alias->name . '[[:>:]]'));
              $alias_patterns[] = LsString::escapeStringForRegex($alias->name);
            }

            $q->setHydrationMode(Doctrine::HYDRATE_ARRAY);
            $cat_id = constant('RelationshipTable::' . strtoupper($category_name) . '_CATEGORY');
            $q->whereParenWrap();
            $q->andWhere('NOT EXISTS (SELECT DISTINCT l.relationship_id FROM Link l ' . 'WHERE l.entity1_id = e.id AND l.entity2_id = ? AND l.category_id = ?)', array($this->entity['id'], $cat_id));
            $summary_matches = $q->execute();

            // with no aliases the pattern would be /()/ and wrap every
            // position in <strong></strong>, so skip highlighting then
            $highlight_re = count($alias_patterns) ? '/(' . implode("|", $alias_patterns) . ')/is' : null;

            foreach ($summary_matches as $summary_match) {
              if ($highlight_re !== null) {
                $summary_match['summary'] = preg_replace($highlight_re, '<strong>$1</strong>', $summary_match['summary']);
              }
              $this->matches[] = array('search_results' => array($summary_match));
            }
          } else {
            for ($i = 0; $i < count($names); $i++) {
              if (!isset($names[$i]['name']) || trim($names[$i]['name']) == '') {
                continue;
              }

              $name = $names[$i]['name'];
              $terms = $name;
              $primary_ext = null;

              if ($this->default_type == 'Person') {
                $name_parts = preg_split('/\\s+/', $name);
                if (count($name_parts) > 1) {
                  $terms = PersonTable::nameSearch($name);
                }
                $primary_ext = "Person";
              } elseif ($this->default_type == 'Org') {
                $terms = OrgTable::nameSearch($name);
                $primary_ext = "Org";
              }

              // arguments: terms, page, num, listIds, aliases, primary extension
              $pager = EntityTable::getSphinxPager($terms, 1, 20, null, true, $primary_ext);

              $match = $names[$i];
              $match['search_results'] = $pager->execute();

              if (isset($names[$i]['types'])) {
                $types = explode(',', $names[$i]['types']);
                $types = array_map('trim', $types);
                $match['types'] = array();
                foreach ($types as $type) {
                  if (in_array($type, $extensions_arr)) {
                    $match['types'][] = $type;
                  }
                }
              }

              $this->matches[] = $match;
            }
          }
        }
      }
    }
  } elseif ($page = $this->getRequestParameter('page')) {
    $this->page = $page;
    $this->num = $this->getRequestParameter('num', 50);
  } elseif ($request->isMethod('post') && $request->getParameter('commit') == 'Submit') {
    $this->ref_id = $this->getRequestParameter('ref_id');
    $relationship_category = $this->getRequestParameter('category_name');
    $order = $this->getRequestParameter('order');
    $default_type = $request->getParameter('default_type');
    $default_ref = Doctrine::getTable('Reference')->find($request->getParameter('ref_id'));
    $from_db_search = $request->hasParameter('add_method') && $request->getParameter('add_method') == 'db_search';

    // cast once so a non-numeric value cannot turn the loop bound into a string comparison
    $count = (int) $this->getRequestParameter('count');

    for ($i = 0; $i < $count; $i++) {
      $entity_id = $request->getParameter('entity_' . $i);
      if (!$entity_id) {
        continue;
      }

      // reset per row so a reference typed for one row can never leak into the next
      $ref = null;
      $selected_entity_id = null;

      $relParams = $request->getParameter("relationship_" . $i);
      if (!is_array($relParams)) {
        $relParams = array();
      }

      if (!empty($relParams['ref_name'])) {
        $ref = array(
          'source' => isset($relParams['ref_source']) ? $relParams['ref_source'] : null,
          'name' => $relParams['ref_name']
        );
      }

      if ($entity_id == 'new') {
        $name = $request->getParameter('new_name_' . $i);
        if ($default_type == 'Person') {
          $new_entity = PersonTable::parseFlatName($name);
        } else {
          $new_entity = new Entity();
          $new_entity->addExtension('Org');
          $new_entity->name = trim($name);
        }
        $new_entity->save();

        $new_entity->blurb = $request->getParameter('new_blurb_' . $i);
        $new_entity->summary = $request->getParameter('new_summary_' . $i);

        if (!$ref) {
          $ref = $default_ref;
        }
        $new_entity->addReference($ref['source'], null, null, $ref['name']);

        if ($types = $request->getParameter('new_extensions_' . $i)) {
          foreach ($types as $type) {
            $new_entity->addExtension($type);
          }
        }
        $new_entity->save();
        $selected_entity_id = $new_entity->id;
      } elseif ($entity_id > 0) {
        $selected_entity_id = $entity_id;
        LsCache::clearEntityCacheById($selected_entity_id);
      }

      if (!$selected_entity_id) {
        continue;
      }

      $startDate = isset($relParams['start_date']) ? $relParams['start_date'] : null;
      $endDate = isset($relParams['end_date']) ? $relParams['end_date'] : null;

      // strip everything that is not a Relationship field; ref_source is the
      // key actually read above, ref_url is kept from the original list
      unset($relParams['start_date'], $relParams['end_date'], $relParams['ref_name'], $relParams['ref_url'], $relParams['ref_source']);

      $rel = new Relationship();
      $rel->setCategory($relationship_category);

      if ($order == '1') {
        $rel->entity1_id = $this->entity['id'];
        $rel->entity2_id = $selected_entity_id;
      } else {
        $rel->entity2_id = $this->entity['id'];
        $rel->entity1_id = $selected_entity_id;
      }

      //only set dates if valid
      if ($startDate) {
        $dbStart = Dateable::convertForDb($startDate);
        if (preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', $dbStart)) {
          $rel->start_date = $dbStart;
        }
      }
      if ($endDate) {
        $dbEnd = Dateable::convertForDb($endDate);
        if (preg_match('#^\\d{4}-\\d{2}-\\d{2}$#', $dbEnd)) {
          $rel->end_date = $dbEnd;
        }
      }

      // third argument is the hydrateCategory flag
      $rel->fromArray($relParams, null, true);

      if ($from_db_search) {
        // prefer a reference already attached to the matched entity
        $refs = EntityTable::getSummaryReferences($selected_entity_id);
        if (!count($refs)) {
          $refs = EntityTable::getAllReferencesById($selected_entity_id);
        }
        if (count($refs)) {
          $ref = $refs[0];
        }
      }

      if (!$ref) {
        $ref = $default_ref;
      }

      $rel->saveWithRequiredReference(array('source' => $ref['source'], 'name' => $ref['name']));
    }

    $this->clearCache($this->entity);
    $this->redirect($this->entity->getInternalUrl());
  } elseif ($request->isMethod('post') && $request->getParameter('commit') == 'Cancel') {
    $this->redirect($this->entity->getInternalUrl());
  }
}