/**
 * Collects entity names mentioned in an HTML document.
 *
 * Names come from two sources: the OpenCalais response (Person, Company and
 * Organization entries) and LsLanguage::getHtmlPersonNames().
 *
 * @param string $text         HTML to scan
 * @param string $entity_types 'all' (OpenCalais people, companies and
 *                             organizations plus the LsLanguage person names),
 *                             'people' (OpenCalais people plus the LsLanguage
 *                             person names) or 'orgs' (OpenCalais companies
 *                             and organizations only)
 *
 * @return array the merged names; an empty array when $entity_types is not
 *               one of the values above
 */
public static function getHtmlEntityNames($text, $entity_types)
{
    $ls_names = LsLanguage::getHtmlPersonNames($text);

    $oc = new LsOpencalais();
    $oc->setParameter(array('contentType' => 'text/html'));
    $oc->setContent($text);
    $oc->execute();
    $response = $oc->getParsedResponse(array("Person", "Company", "Organization"));

    // A type with no hits may be missing from the response; treat it as empty
    // instead of reading an undefined index.
    $people = isset($response['Person']) ? (array) $response['Person'] : array();
    $companies = isset($response['Company']) ? (array) $response['Company'] : array();
    $organizations = isset($response['Organization']) ? (array) $response['Organization'] : array();

    switch ($entity_types) {
        case 'all':
            return array_merge($people, $companies, $organizations, $ls_names);

        case 'people':
            return array_merge($people, $ls_names);

        case 'orgs':
            return array_merge($companies, $organizations);

        default:
            // Unknown selector: return a well-defined empty list rather than
            // an undefined variable.
            return array();
    }
}
?> <?php
// Prints a one-sentence description of $relationship: both entity links,
// a verb whose tense depends on $current (NULL means the tense is unknown),
// and either the pluralized shared description or a generic fallback.
$sameDescriptions = RelationshipTable::areSameDescriptions($relationship);

echo ' ', entity_link($relationship['Entity1']), ' and ', entity_link($relationship['Entity2']), ' ';

// Verb forms in the order: tense unknown, current, past.
$verbForms = $sameDescriptions
    ? array("are/were", "are", "were")
    : array("have/had", "have", "had");

if ($current === NULL) {
    echo $verbForms[0];
} elseif ($current == '1') {
    echo $verbForms[1];
} else {
    echo $verbForms[2];
}

if ($sameDescriptions) {
    echo ' ', LsLanguage::pluralize($relationship['description1']);
} else {
    echo ' a generic relationship ';
}
?>
</tr> <?php foreach ($stats as $label => $count) { ?> <tr class="text_big"> <td style="text-align: right; padding-right: .7em;"> <strong><?php echo format_number($count); ?> </strong> </td> <td> new <?php echo LsLanguage::pluralize($label); ?> </td> </tr> <?php } ?> </table> </div> <?php } ?> <?php cache_save();
/**
 * Imports education history for $person from a set of scraped candidates.
 *
 * Each candidate is skipped when it has no education or no employment
 * history. Otherwise its summary plus employer names are compared with
 * $person's summary plus the names of the organizations $person holds
 * positions in (LsLanguage::getCommonPronouns). When they share something,
 * every education entry of the candidate is imported; when they share
 * nothing the whole import stops and false is returned.
 *
 * @param Entity $person           the person already in the database
 * @param mixed  $possible_persons iterable of candidate records exposing
 *                                 name, summary, education and
 *                                 employment_history
 *
 * @return bool false as soon as a candidate with education and employment
 *              data shares no organization with $person, true otherwise
 */
function import(Entity $person, $possible_persons)
{
    //loop through the people we found. usually just one.
    foreach ($possible_persons as $possible_person) {
        $this->printDebug('Query returned ' . count($possible_person) . ' person named ' . $possible_person->name);

        //this person does not provide education. we skip
        if (count($possible_person->education)) {
            $this->printDebug('Education found');
        } else {
            $this->printDebug('No education history found');
            continue;
        }

        //get employment info for this possible match
        $possible_person_bio = $possible_person->summary;
        if (count($possible_person->employment_history)) {
            foreach ($possible_person->employment_history as $employment) {
                $possible_person_bio .= ' ' . $employment->company . " ";
            }
            $this->printDebug('Employment found');
        } else {
            $this->printDebug('No employment history found');
            continue;
        }

        //get employment info for the person in our database
        $relationship_orgs = $person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 1)->execute();
        $person_bio = $person->summary;
        foreach ($relationship_orgs as $org) {
            $person_bio .= ' ' . $org->name;
        }

        //lets see how many matches we get
        $matches = LsLanguage::getCommonPronouns($person_bio, trim($possible_person_bio), LsLanguage::$business);
        if (!count($matches)) {
            $this->printDebug('No organization matches');
            return false;
        }

        foreach ($possible_person->education as $school) {
            $this->importSchoolRecord($person, $school);
        }
    }

    return true;
}

/**
 * Imports one education entry: resolves the school and the degree, then makes
 * sure an Education relationship carrying that degree exists.
 *
 * @param Entity $person the person the education belongs to
 * @param mixed  $school record exposing institution, degree, year and source;
 *                       its institution property is normalized in place
 */
private function importSchoolRecord(Entity $person, $school)
{
    $school->institution = mb_convert_encoding($school->institution, 'UTF-8');
    // Normalize whitespace (including non-breaking spaces) to single spaces.
    // NOTE(review): the pattern previously in this position was empty, which
    // matches between every character and so spaced out the whole name;
    // whitespace normalization is presumably what was intended - confirm.
    $school->institution = preg_replace('/[\\s\\x{00A0}]+/u', ' ', $school->institution);
    $this->printDebug('Looking for the school: ' . $school->institution);

    $current_school = $this->findOrCreateSchoolByName($school->institution);

    //find degree
    $degree = DegreeTable::getByText($school->degree);
    if (!$degree) {
        $degree = DegreeTable::addDegree($school->degree);
        $this->printDebug('Adding new degree');
    }

    //find relationship
    $relationship = null;
    $relationships = $person->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)->execute();
    foreach ($relationships as $existing_relationship) {
        if ($existing_relationship->degree_id == $degree->id) {
            $relationship = $existing_relationship;
            break;
        }
    }

    if ($relationship) {
        $this->printDebug('Relationship between person and school exists');
    } else {
        $relationship = new Relationship();
        $relationship->Entity1 = $person;
        $relationship->Entity2 = $current_school;
        $relationship->description1 = 'student';
        $relationship->is_current = 0;
        if ($school->year) {
            $relationship->end_date = $school->year;
        }
        $relationship->setCategory('Education');
        $this->printDebug('Creating new relationship between person and school');
    }

    //save
    $relationship->save();

    //add degree and reference
    if ($relationship->degree_id == null) {
        $reference_name = strstr($school->source, 'wikipedia') ? "Wikipedia" : "BusinessWeek";
        $relationship->Degree = $degree;
        $relationship->save();
        $relationship->addReference($school->source, null, array('degree_id'), $reference_name, null, null);
        $this->printDebug('Adding degree and reference');
    }
}

/**
 * Resolves a school entity by name: first through the 'bw_school' alias
 * context, then by a case-insensitive substring match on Org/School names,
 * and finally by creating a new school. Whenever the alias lookup missed, a
 * 'bw_school' alias is saved for the resolved school.
 *
 * @param string $institution normalized institution name
 *
 * @return Entity the matching or newly created school
 */
private function findOrCreateSchoolByName($institution)
{
    $current_school = EntityTable::findByAlias($institution, 'bw_school');
    if ($current_school) {
        $this->printDebug('Found school');
        return $current_school;
    }

    // The name was converted to UTF-8 above, so lower-case it with the
    // multibyte-aware function rather than the byte-wise strtolower().
    $current_school = EntityTable::getByExtensionQuery(array('Org', 'School'))
        ->addWhere('LOWER(org.name) LIKE ?', '%' . mb_strtolower($institution, 'UTF-8') . "%")
        ->fetchOne();

    if (!$current_school) {
        $current_school = $this->createSchoolFromWikipedia($institution);
        $this->printDebug('Adding new school');
    }

    $alias = new Alias();
    $alias->name = $institution;
    $alias->context = 'bw_school';
    $alias->Entity = $current_school;
    $alias->save();

    return $current_school;
}

/**
 * Creates and saves a new Org/School entity, filling in whatever the
 * Wikipedia info box provides (students, faculty, public/private, endowment,
 * founding date, summary). When the Wikipedia request fails or lands on a
 * disambiguation page the school is saved with its name only.
 *
 * @param string $institution school name
 *
 * @return Entity the saved school
 */
private function createSchoolFromWikipedia($institution)
{
    $new_school = new Entity();
    $new_school->addExtension('Org');
    $new_school->addExtension('School');
    $new_school->name = $institution;

    $wikipedia = new LsWikipedia();
    $wikipedia->request($institution);

    if (!$wikipedia->execute() || $wikipedia->isDisambiguation()) {
        $new_school->save();
        return $new_school;
    }

    $info_box = $wikipedia->getInfoBox();

    // Student count: a single "students" figure wins; otherwise add up the
    // per-level figures that are present.
    if (isset($info_box['students']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['students']['clean'], $match)) {
        $new_school->students = LsNumber::clean($match[1]);
    } else {
        $student_types = array('undergrad', 'postgrad', 'grad', 'doctoral');
        $num_students = 0;
        foreach ($student_types as $st) {
            if (isset($info_box[$st]) && preg_match('/([\\d\\,]{2,})/isu', $info_box[$st]['clean'], $match)) {
                $num_students += LsNumber::clean($match[1]);
            }
        }
        if ($num_students > 0) {
            $new_school->students = $num_students;
        }
    }

    if (isset($info_box['faculty']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['faculty']['clean'], $match)) {
        $new_school->faculty = LsNumber::clean($match[1]);
    }

    // "public" is checked first, so a type mentioning both counts as public.
    if (isset($info_box['type'])) {
        if (stristr($info_box['type']['clean'], 'public')) {
            $new_school->is_private = 0;
        } elseif (stristr($info_box['type']['clean'], 'private')) {
            $new_school->is_private = 1;
        }
    }

    if (isset($info_box['endowment'])
        && preg_match('/(\\$[\\d\\,\\.\\s]+)(million|billion)/isu', $info_box['endowment']['clean'], $match)
    ) {
        $factor = strtolower($match[2]) == 'billion' ? 1000000000 : 1000000;
        $new_school->endowment = LsNumber::formatDollarAmountAsNumber($match[1], $factor);
    }

    // Founding date: a full date when it parses, otherwise a bare 4-digit year.
    if (isset($info_box['established'])) {
        if ($date = LsDate::convertDate($info_box['established']['clean'])) {
            $new_school->start_date = $date;
        } elseif (preg_match('/\\b(\\d\\d\\d\\d)\\b/isu', $info_box['established']['clean'], $match)) {
            $new_school->start_date = $match[1];
        }
    }

    $summary = trim($wikipedia->getIntroduction());
    $summary = preg_replace('/\\n\\s*\\n/isu', '', $summary);
    if (strlen($summary) > 10) {
        $new_school->summary = $summary;
    }

    $new_school->save();
    $new_school->addReference($wikipedia->getUrl(), null, array('summary'), 'Wikipedia');

    return $new_school;
}
/**
 * Builds an unsaved Person entity from a "LAST, First Middle (Nick), Suffix"
 * style name string.
 *
 * A parenthesized nickname is extracted first, periods are removed, and the
 * remainder is split on commas: part 0 is the last name, part 1 holds the
 * first name followed by any middle names, and part 2 (when present) is the
 * suffix.
 *
 * @param string $str raw name
 *
 * @return Entity|null the populated entity, or null when $str contains no
 *                     comma
 */
static function parseBioguideName($str)
{
    $entity = new Entity();
    $entity->addExtension('Person');

    //extract nickname
    if (preg_match('/\\(([^(]+)\\)/', $str, $nick)) {
        $entity->name_nick = $nick[1];
        $str = preg_replace('/\\(.*\\)/U', '', $str);
    }

    $str = preg_replace('/\\s{2,}/', ' ', $str);
    $str = str_replace('.', '', $str);
    $parts = explode(',', trim($str));

    if (count($parts) < 2) {
        return null;
    }

    // Lower-case with the detected encoding; fall back to the mbstring default
    // when detection fails (mb_detect_encoding() returns false).
    $last = trim($parts[0]);
    $encoding = mb_detect_encoding($last);
    $lowered = $encoding ? mb_strtolower($last, $encoding) : mb_strtolower($last);
    $entity->name_last = LsLanguage::nameize($lowered);

    $other = explode(' ', trim($parts[1]));
    $entity->name_first = trim($other[0]);

    if (count($other) > 1) {
        // implode() takes the separator first; the reversed legacy order is
        // no longer accepted by PHP 8.
        $entity->name_middle = trim(implode(' ', array_slice($other, 1)));
    }

    if (count($parts) > 2) {
        $entity->name_suffix = trim($parts[2]);
    }

    return $entity;
}
/**
 * Scrapes one Forbes private-company profile page and stores the company.
 *
 * The page layout depends on the list year ($this->year), so most fields have
 * one pattern per layout generation. When a PrivateCompany with the same
 * lower-cased name exists only its revenue is updated; otherwise a new
 * Org/Business/PrivateCompany entity is created with the scraped details and,
 * when city and state were found, an address. In both cases the company gets
 * a Forbes.com reference and is passed to saveToList() with its rank.
 *
 * Nothing is stored when the page cannot be fetched or no company name is
 * found.
 *
 * @param string $url profile page URL
 *
 * @return void
 */
protected function import($url)
{
    if ($this->browser->get($url)->responseIsError()) {
        $this->printDebug("Couldn't get company: " . $url);
        return;
    }

    $text = $this->browser->getResponseText();

    $rank = null;
    $name = null;
    $industryName = null;
    $street1 = null;
    $street2 = null;
    $city = null;
    $state = null;
    $postal = null;
    $phone = null;
    $fax = null;
    $website = null;
    $summary = null;
    $revenue = null;
    $employees = null;
    $ceoName = null;
    $ceoBirthYear = null;

    //get rank
    if ($this->year > 1999 && $this->year < 2005 && preg_match('/ForbesListRank" content="(\\d+)"/i', $text, $match)) {
        $rank = $match[1];
    } elseif ($this->year < 2000 && preg_match('/td class="highlightcolor1">(\\d+)/i', $text, $match)) {
        $rank = $match[1];
    } elseif ($this->year > 2004 && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/i', $text, $match)) {
        $rank = html_entity_decode($match[1]);
    }

    //get name
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/span class="mainlisttitle">([^<]+)<\\/span>/i', $text, $match)) {
        $name = html_entity_decode($match[1]);
    } elseif ($this->year > 2004 && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/i', $text, $match)) {
        $name = html_entity_decode($match[2]);
    } else {
        $this->printDebug("Company name not found");
        return;
    }

    //get industry (parsed but not stored by this method)
    if ($this->year > 1995 && $this->year < 2001 && preg_match('/<b>See more private companies in <a [^>]+>([^<]+)<\\/a><\\/b>/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    } elseif ($this->year > 2000 && $this->year < 2005 && preg_match('/private companies\\<\\/a> in ([^\\.]+)/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    } elseif ($this->year > 2004 && preg_match('/<b>Industry:<\\/b> <a href="[^"]+">([^<]+)<\\/a>/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    }

    //get address
    $contact = null;
    if ($this->year > 1995 && $this->year < 2000 && preg_match('/<td class="mainlisttxt"\\>(.+)phone/smU', $text, $match)) {
        $contact = $this->parseLegacyContactBlock($match[1]);
    } elseif ($this->year > 1999 && $this->year < 2005 && preg_match('/(view private companies under this industry|in the same industry).+<br><br>(.+)phone/is', $text, $match)) {
        // Group 1 is only the marker text; the address block is group 2.
        $contact = $this->parseLegacyContactBlock($match[2]);
    } elseif ($this->year > 2004 && preg_match('/<div class="spaced">(.+)<\\/div>/ismU', $text, $match)) {
        $contactLines = explode('<br>', $match[1]);
        $firstIsContact = preg_match('/Phone\\:|Fax\\:/i', $contactLines[0]);
        $secondIsContact = isset($contactLines[1]) && preg_match('/Phone\\:|Fax\\:/i', $contactLines[1]);

        // Only treat the block as an address when it does not open with the
        // phone/fax lines.
        if (!$firstIsContact && !$secondIsContact) {
            $street1 = trim($contactLines[0]);
            $localityLine = null;

            if (count($contactLines) == 4) {
                $localityLine = $contactLines[1];
            } elseif (count($contactLines) == 5) {
                $street2 = $contactLines[1];
                $localityLine = $contactLines[2];
            }

            if ($localityLine !== null && preg_match('/^(.+?) ([A-Z]{2}) (\\d{5})($|-)/sU', trim($localityLine), $match)) {
                $city = $match[1];
                $state = $match[2];
                $postal = $match[3];
            }
        }
    }

    if ($contact !== null) {
        $street1 = $contact['street1'];
        $street2 = $contact['street2'];
        $city = $contact['city'];
        $state = $contact['state'];
        $postal = $contact['postal'];
    }

    //get phone
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/phone ([\\d\\-]{12})/is', $text, $match)) {
        $phone = trim(str_replace('-', '', $match[1]));
    } elseif ($this->year > 2004 && preg_match('/Phone: ([\\d\\-]{12})/is', $text, $match)) {
        $phone = trim(str_replace('-', '', $match[1]));
    }

    //get fax (parsed, but adding it to the company is disabled below)
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/fax ([\\d\\-]{12})/is', $text, $match)) {
        $fax = trim(str_replace('-', '', $match[1]));
    } elseif ($this->year > 2004 && preg_match('/Fax: ([\\d\\-]{12})/is', $text, $match)) {
        $fax = trim(str_replace('-', '', $match[1]));
    }

    //get website
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/this company\'s web site[^>]+\\>(http[^\\<]+)/is', $text, $match)) {
        $website = $match[1];
    } elseif ($this->year > 2004 && preg_match('/<div class="spaced">.*<\\/div>\\s+<br>\\s+<a href="(http:\\/\\/[^"]+)">/ismU', $text, $match)) {
        $website = $match[1];
    }

    //get ceo (parsed but not stored by this method)
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/b>CEO: ([^<]+)<\\/b>/ism', $text, $match)) {
        $ceoName = $match[1];
    } elseif ($this->year > 2004 && preg_match('/CEO: ([^<]+)<\\/b> , (\\d+) <br>/ism', $text, $match)) {
        $ceoName = html_entity_decode($match[1]);
        // presumably the number after the name is the CEO's age - confirm
        $ceoBirthYear = date("Y") - $match[2];
    }

    //get summary
    if ($this->year > 1995 && $this->year < 2000 && preg_match_all('/p class="mainlisttxt">(.*)<\\/p>/ismU', $text, $match)) {
        // In this layout the summary is the second matching paragraph.
        if (isset($match[1][1])) {
            $summary = $this->cleanSummaryHtml($match[1][1]);
        }
    } elseif ($this->year > 1999 && $this->year < 2005 && preg_match('/p class="mainlisttxt">(.*)<\\/p>/ismU', $text, $match)) {
        $summary = $this->cleanSummaryHtml($match[1]);
    } elseif ($this->year > 2004 && preg_match('/<blockquote class="spaced">(.*)<\\/blockquote>/ismU', $text, $match)) {
        $summary = $this->cleanSummaryHtml($match[1]);
    }

    //get revenue
    if ($this->year > 1995 && $this->year < 2000 && preg_match('/<td class="mainlisttxt">\\$([\\S]+) mil<sup>e?<\\/sup><\\/td>/ismU', $text, $match)) {
        $this->printDebug($match[1]);
        $revenue = str_replace(",", "", $match[1] . ",000,000");
    } elseif ($this->year > 1999 && $this->year < 2005 && preg_match('/<td class="mainlisttxt" nowrap>([^<]+)<sup>e?<\\/sup><\\/td>/ismU', $text, $match)) {
        $this->printDebug($match[1]);
        $revenue = str_replace(",", "", $match[1] . ",000,000");
    } elseif ($this->year > 2004 && preg_match('/<td class="highlight" nowrap="nowrap">\\$([\\S]+) bil.*<\\/td> <td class="highlight" nowrap="nowrap">[^<]+<\\/td> <td class="highlight" nowrap="nowrap">([^<]+)<\\/td>/ismU', $text, $match)) {
        $revenue = 1000000000 * $match[1];
    }

    //get employees
    if ($this->year > 1995 && $this->year < 2005 && preg_match('/mil<\\/td>.+<td class="mainlisttxt"( nowrap)?>(\\d[^<]+)<\\/td>.+<td class="mainlisttxt">[a-zA-Z]+<\\/td>/ismU', $text, $match)) {
        $employees = str_replace(',', '', $match[2]);
    } elseif ($this->year > 1999 && $this->year < 2005 && preg_match('/<sup>e?<\\/sup><\\/td> <td class="mainlisttxt"( nowrap)?>(\\d[^<]+)<sup>e?<\\/sup><\\/td> <td class="mainlisttxt">[a-zA-Z]+<\\/td>/ismU', $text, $match)) {
        $employees = str_replace(',', '', $match[2]);
    } elseif ($this->year > 2004 && preg_match('/<td class="highlight" nowrap="nowrap">([\\d,]+)<\\/td> <td class="highlight" nowrap="nowrap">[A-Z][a-z]{2,}<\\/td>/', $text, $match)) {
        $employees = str_replace(',', '', $match[1]);
    }

    // Name with business words/abbreviations removed; only used for the debug
    // line below - the lookup itself matches on the full name.
    $search_company_name = trim(implode(' ', array_diff(explode(' ', ucwords(strtolower($name))), array_merge(LsLanguage::$business, LsLanguage::$businessAbbreviations))));
    $this->printDebug("{$search_company_name} == {$name}");

    $company = EntityTable::getByExtensionQuery(array('Org', 'PrivateCompany'))
        ->addWhere("LOWER(REPLACE( org.name, '-' , '')) = ?", strtolower($name))
        ->fetchOne();

    if ($company) {
        $this->printDebug("Company exists");
        $company->revenue = $revenue;
        $company->save();
    } else {
        $this->printDebug("Creating new company {$name}");
        Doctrine::getTable('ExtensionDefinition')->clear();

        $company = new Entity();
        $company->addExtension('Org');
        $company->addExtension('Business');
        $company->addExtension('PrivateCompany');
        $company->name = LsLanguage::titleize($name);
        $company->employees = strlen($employees) ? $employees : null;
        $company->revenue = strlen($revenue) ? $revenue : null;
        $company->website = strlen($website) ? $website : null;
        $company->summary = strlen($summary) ? trim($summary) : null;

        if ($phone) {
            $company->addPhone($phone);
        }

        //add address
        if ($city && $state) {
            $address = new Address();
            $address->street1 = strlen($street1) ? $street1 : null;
            $address->street2 = strlen($street2) ? $street2 : null;
            $address->city = strlen($city) ? $city : null;
            if ($stateRecord = AddressStateTable::retrieveByText($state)) {
                $address->State = $stateRecord;
            }
            $address->postal = $postal;
            $company->addAddress($address);
            $address->save();
            $address->addReference($url, null, array('city', 'country_id', 'postal', 'state_id', 'street1'), 'Forbes.com', null, null);
        }
    }

    $company->save();
    $company->addReference($url, null, array('website', 'name', 'website', 'summary', 'revenue', 'employees'), 'Forbes.com', null, null);
    $this->saveToList($company, $rank);
}

/**
 * Splits a pre-2005 contact block ("street<br>[street2<br>]city/state/zip<br>...")
 * into address fields. The last <br>-separated piece is discarded; with three
 * remaining pieces the middle one is the second street line.
 *
 * @param string $block raw HTML captured in front of the "phone" label
 *
 * @return array with keys street1, street2, city, state, postal
 */
private function parseLegacyContactBlock($block)
{
    $contactLines = explode('<br>', trim($block));
    array_pop($contactLines);

    $hasSecondStreetLine = count($contactLines) == 3;
    $localityIndex = $hasSecondStreetLine ? 2 : 1;
    $locality = LsLanguage::parseCityStatePostal(isset($contactLines[$localityIndex]) ? $contactLines[$localityIndex] : null);

    return array(
        'street1' => isset($contactLines[0]) ? $contactLines[0] : null,
        'street2' => $hasSecondStreetLine ? $contactLines[1] : null,
        'city' => $locality['city'],
        'state' => $locality['state'],
        'postal' => $locality['zip'],
    );
}

/**
 * Turns a summary HTML fragment into a single line of plain text.
 *
 * @param string $html fragment captured from the page
 *
 * @return string tag-free, entity-decoded text with newlines turned into spaces
 */
private function cleanSummaryHtml($html)
{
    // NOTE(review): the first search string is a plain space in this copy of
    // the file, which makes that pair a no-op; it may originally have been a
    // non-breaking space - confirm against version control.
    return str_replace(array(' ', "\n"), array(' ', ' '), html_entity_decode(trim(strip_tags($html))));
}
/**
 * Action for adding board members to the current entity from a source URL.
 *
 * POST with a "reference" and no "ref_id": validates the reference form,
 * loads or creates the Reference, fetches its URL, and stores the person
 * names found there (OpenCalais plus LsLanguage HTML parsing) in the
 * 'board_names' user attribute.
 *
 * POST with "ref_id": reads up to $this->lim entity_N parameters (an id, or
 * 'new' together with new_name_N / new_blurb_N), and creates a board
 * Position relationship for each person who does not already have one with
 * this entity.
 *
 * Any non-POST request clears 'board_names'.
 *
 * Finally, when 'board_names' is set, one page of names starting at the
 * "start" request parameter is looked up with the Sphinx person search and
 * exposed as $this->matches, along with $this->start, $this->end and
 * $this->total.
 *
 * @param sfWebRequest $request
 */
public function executeAddBoard($request)
{
    $this->checkEntity($request, false, false);

    //FIND NAMES AT REF URL PROVIDED
    $this->reference_form = new ReferenceForm();
    $this->reference_form->setSelectObject($this->entity);

    if ($request->isMethod('post')) {
        $commit = $request->getParameter('commit');
        if ($commit == 'Cancel') {
            $this->redirect(EntityTable::getInternalUrl($this->entity));
        }

        $this->lim = 5;

        if (!$request->hasParameter('ref_id') && $request->hasParameter('reference')) {
            // REFERENCE INFO HAS BEEN SUBMITTED, SO GO TO URL AND SCRAPE
            $this->getUser()->setAttribute('board_names', null);
            $refParams = $request->getParameter('reference');
            $this->reference_form->bind($refParams);

            if ($this->reference_form->isValid()) {
                if ($this->ref_id = $refParams['existing_source']) {
                    $ref = Doctrine::getTable('Reference')->find($this->ref_id);
                } else {
                    $ref = new Reference();
                    $ref->object_model = 'Entity';
                    $ref->object_id = $this->entity->id;
                    $ref->source = $refParams['source'];
                    $ref->name = $refParams['name'];
                    $ref->source_detail = $refParams['source_detail'];
                    $ref->publication_date = $refParams['publication_date'];
                    $ref->save();
                    $this->ref_id = $ref->id;
                }
                $url = $ref->source;

                //FIND NAMES AT URL USING COMBO OF OPENCALAIS & LS CUSTOM HTML PARSING
                $browser = new sfWebBrowser();
                if (!$browser->get($url)->responseIsError()) {
                    $text = $browser->getResponseText();
                    $ls_names = LsLanguage::getHtmlPersonNames($text);

                    $oc = new LsOpencalais();
                    $oc->setParameter(array('contentType' => 'text/html'));
                    $oc->setContent($text);
                    $oc->execute();
                    $response = $oc->getParsedResponse(array("Person"));
                    $oc_names = (array) $response['Person'];

                    $names = array_unique(array_merge($oc_names, $ls_names));
                    // sort() also re-indexes from 0, which the paging below
                    // relies on.
                    sort($names);
                    $this->getUser()->setAttribute('board_names', $names);
                }
            }
        } elseif ($request->hasParameter('ref_id')) {
            $this->ref_id = $this->getRequestParameter('ref_id');
            $entity_ids = array();

            for ($i = 0; $i < $this->lim; $i++) {
                $entity_id = $request->getParameter('entity_' . $i);
                if (!$entity_id) {
                    continue;
                }

                if ($entity_id == 'new') {
                    $name = $request->getParameter('new_name_' . $i);
                    $new_entity = PersonTable::parseFlatName($name);
                    $new_entity->blurb = $request->getParameter('new_blurb_' . $i);
                    if ($name && !$new_entity->name_last) {
                        $request->setError('name', 'The name you entered is invalid');
                    } else {
                        $new_entity->save();
                        $entity_ids[] = $new_entity->id;
                    }
                } elseif ($entity_id > 0) {
                    $entity_ids[] = $entity_id;
                }
            }

            $this->existing_rels = array();
            $this->new_rels = array();

            //CHECK FOR EXISTING RELATIONSHIPS, CREATE NEW IF NONE FOUND
            foreach ($entity_ids as $entity_id) {
                $existing_rel = LsDoctrineQuery::create()
                    ->from('Relationship r')
                    ->leftJoin('r.Position p')
                    ->where('r.entity1_id = ? and r.entity2_id = ? and p.is_board = ?', array($entity_id, $this->entity->id, '1'))
                    ->fetchOne();

                if ($existing_rel) {
                    $this->existing_rels[] = $existing_rel;
                    continue;
                }

                $rel = new Relationship();
                $rel->entity1_id = $entity_id;
                $rel->entity2_id = $this->entity->id;
                $rel->setCategory('Position');
                $rel->description1 = 'Board Member';
                $rel->description2 = 'Board Member';
                $rel->is_board = 1;
                $rel->is_employee = 0;
                $rel->saveWithRequiredReference(array(
                    'existing_source' => $this->ref_id,
                    'excerpt' => null,
                    'source_detail' => null,
                    'publication_date' => null,
                ));
                $this->new_rels[] = $rel;
            }
        }
    } else {
        $this->getUser()->setAttribute('board_names', null);
    }

    // IF BOARD NAMES SESSION VARIABLE NOT NULL, PAGE THROUGH TO CORRECT START
    if ($board_names = $this->getUser()->getAttribute('board_names')) {
        // Cast so that a missing "start" parameter means offset 0; a null
        // offset would index $board_names with an empty key and show nothing.
        $this->start = (int) $this->getRequestParameter('start');
        $this->matches = array();

        if (count($board_names) > $this->start) {
            for ($i = $this->start; $i < $this->start + $this->lim; $i++) {
                if (!isset($board_names[$i])) {
                    break;
                }
                $name = $board_names[$i];
                // Search existing people (aliases included) for this scraped name.
                $pager = EntityTable::getSphinxPager($name, 1, 10, null, true, "Person");
                $this->matches[$name] = $pager->execute();
            }
        }

        $this->total = count($board_names);
        $this->end = count($this->matches) < $this->lim
            ? $this->start + count($this->matches)
            : $this->start + $this->lim;
    }

    if ($this->hasRequestParameter('finished')) {
        $this->finished = 1;
    }
}
?> <?php slot('header_actions', array('remove' => array('credential' => 'deleter', 'url' => 'tag/remove?name=' . $tag->getName(), 'options' => 'post=true confirm=Are you sure you want to remove this tag?'))); ?> <?php foreach ($models as $model) { ?> <?php $pager = eval('return $' . strtolower($model) . '_pager;'); ?> <?php include_partial('global/section', array('title' => LsLanguage::pluralize($model), 'pager' => $pager, 'more' => 'tag/objects?name=' . $tag->getName() . '&model=' . $model)); ?> <div class="padded"> <?php foreach ($pager->execute() as $object) { ?> <strong><?php echo link_to($object, strtolower($model) . '/view?id=' . $object->id); ?> </strong> <br /> <?php } ?> </div>
/**
 * Extracts candidate proper names from a biography.
 *
 * Starts from LsLanguage::getAllNames() and expands the list by splitting
 * each candidate on connecting words ("of the", "at", "and", ...) and on
 * possessive "'s". Candidates are then trimmed, stripped of one trailing
 * comma/period/apostrophe, filtered against an exclusion list (regions,
 * common names, states, cities, grammar words, dates, geography, possessives,
 * schools, common positions and this entity's own name and name parts),
 * de-duplicated and passed through LsArray::strlenSort().
 *
 * @param string|null $bio text to parse; defaults to $this->Entity->summary
 *
 * @return array candidate names
 */
public function parseBio($bio = null)
{
    if (!$bio) {
        $bio = $this->Entity->summary;
    }

    $name_matches = LsLanguage::getAllNames($bio);
    $connectors = array('for\\s+the', 'of\\s+the', 'at\\s+the', 'at', 'of', 'the', 'for', 'and');

    // $name_matches grows while it is being scanned (split pieces are appended
    // and examined in turn), so count() must be re-evaluated on every pass.
    for ($i = 0; $i < count($name_matches); $i++) {
        $name = $name_matches[$i];

        foreach ($connectors as $connector) {
            $pieces = preg_split('/\\s+' . $connector . '\\s+/isu', $name, -1, PREG_SPLIT_NO_EMPTY);
            if (count($pieces) <= 1) {
                continue;
            }

            if (!in_array($pieces[0], LsLanguage::$commonPositions)) {
                $name_matches = array_merge($name_matches, $pieces);
            } else {
                // The candidate starts with a position title: drop it and keep
                // working with the remainder for the remaining connectors.
                // NOTE(review): the remainder itself is never appended to
                // $name_matches, so it only survives if a later split adds
                // its pieces - confirm that this is intended.
                array_shift($pieces);
                $glue = str_replace('\\s+', ' ', $connector);
                $name = implode(" {$glue} ", $pieces);
            }
        }

        $pieces = preg_split('/\'s\\s+/isu', $name, -1, PREG_SPLIT_NO_EMPTY);
        if (count($pieces) > 1) {
            $name_matches = array_merge($name_matches, $pieces);
        }
    }

    $exclude = array_merge(
        LsLanguage::$regions,
        LsLanguage::$commonFirstNames,
        LsLanguage::$commonLastNames,
        LsLanguage::$states,
        LsLanguage::$commonCities,
        LsLanguage::$grammar,
        LsLanguage::$weekdays,
        LsLanguage::$months,
        LsLanguage::$geography,
        LsLanguage::$possessives,
        explode(' ', $this->Entity->name),
        array($this->Entity->name),
        LsLanguage::$schools,
        LsLanguage::$commonPositions
    );

    $names = array();
    foreach ($name_matches as $name) {
        $name = trim($name);
        // strip a single trailing comma, period or apostrophe
        $name = preg_replace('/[\\,\\.\'\\’]$/isu', '', $name);
        if (!in_array($name, $exclude)) {
            $names[] = $name;
        }
    }

    $names = array_unique($names);
    $names = LsArray::strlenSort($names);

    return $names;
}
/**
 * Splits a flat personal name into its component fields.
 *
 * Tags are stripped, the characters in LsLanguage::$punctuations are removed
 * and the result is run through LsLanguage::nameize() before being split on
 * single spaces. Word 0 and word 1 are the first and last name (swapped when
 * the tag-stripped input contains a comma after its first character), word 2
 * becomes the middle name unless it is a known prefix or generational suffix,
 * any word that is a generational suffix becomes the suffix, and a word
 * wrapped in single or double quotes becomes the nickname.
 *
 * @param string $str raw name
 *
 * @return array with keys name_prefix, name_first, name_middle, name_last,
 *               name_suffix and name_nick (name_prefix is always null)
 */
static function parseFlatName($str)
{
    $fields = array(
        'name_prefix' => null,
        'name_first' => null,
        'name_middle' => null,
        'name_last' => null,
        'name_suffix' => null,
        'name_nick' => null,
    );

    //trim and remove periods and commas
    $str = strip_tags($str);
    $reversed = strpos($str, ',') ? true : false;
    $words = explode(" ", LsLanguage::nameize(str_ireplace(LsLanguage::$punctuations, '', $str)));

    // "Last, First" input swaps which of the first two words is which.
    $firstAt = $reversed ? 1 : 0;
    $lastAt = $reversed ? 0 : 1;
    $nickPattern = '/\'([\\S]+)\'|"([\\S]+)"/';

    foreach ($words as $position => $word) {
        if ($position == $firstAt) {
            $fields['name_first'] = $word;
        } elseif ($position == $lastAt) {
            $fields['name_last'] = $word;
        }

        $isSuffix = in_array($word, LsLanguage::$generationalSuffixes);
        if ($isSuffix) {
            $fields['name_suffix'] = $word;
        }

        //find nickname in quotes
        if (preg_match($nickPattern, $word, $quoted)) {
            $fields['name_nick'] = $quoted[1] ? $quoted[1] : $quoted[2];
        }

        if ($position == 2 && !in_array($word, LsLanguage::$commonPrefixes) && !$isSuffix) {
            $fields['name_middle'] = $word;
        }
    }

    //return person with name fields
    return $fields;
}
/**
 * Applies LsLanguage::nameize() to each word of a personal name.
 *
 * Words that are at most two characters long once periods are removed
 * (initials such as "J." or "JR") are left untouched.
 *
 * @param string $name whitespace-separated name
 *
 * @return string the processed words joined with single spaces
 */
static function nameizePersonName($name)
{
    $parts = preg_split('/\\s+/', $name);
    $cleanParts = array();

    foreach ($parts as $part) {
        if (strlen(str_replace('.', '', $part)) > 2) {
            $part = LsLanguage::nameize($part);
        }
        $cleanParts[] = $part;
    }

    // Join the processed words; joining $parts would discard the work above.
    return implode(' ', $cleanParts);
}
/**
 * Builds a 'Mailing' address from a parsed filing and attaches it to $person.
 *
 * Nothing happens when the state text cannot be resolved by
 * AddressStateTable::retrieveByText(). In test mode the address is built but
 * never attached. Otherwise, when $person->addAddress() accepts the address,
 * the person is saved and the address gets a reference to the filing.
 *
 * @param array $address_arr keys street1, street2, city, state, postal
 * @param mixed $person      entity receiving the address
 * @param array $person_arr  keys readableXmlUrl, formName, date
 */
private function importAddress($address_arr, $person, $person_arr)
{
    $address = new Address();
    $address->street1 = LsLanguage::nameize($address_arr['street1']);
    $address->street2 = LsLanguage::nameize($address_arr['street2']);
    $address->city = $address_arr['city'];
    $address->Category = Doctrine::getTable('AddressCategory')->findOneByName('Mailing');

    $resolvedState = AddressStateTable::retrieveByText($address_arr['state']);
    if (!$resolvedState) {
        return;
    }
    $address->State = $resolvedState;
    $address->postal = $address_arr['postal'];

    if ($this->testMode) {
        return;
    }

    if (!$person->addAddress($address)) {
        return;
    }

    $person->save();
    $address->addReference(
        $person_arr['readableXmlUrl'],
        null,
        null,
        $this->entity->name . ' ' . $person_arr['formName'],
        null,
        $person_arr['date']
    );
}
/**
 * Creates an unsaved Person entity from a flat name string.
 *
 * The name is split with LsLanguage::parseFlatName() and each name field is
 * copied onto the new entity.
 *
 * @param string $name_str full name
 * @param mixed  $summary  not used by this method
 * @param mixed  $orgs     not used by this method
 *
 * @return Entity the new, unsaved person
 */
protected function generatePerson($name_str, $summary = null, $orgs = null)
{
    $name_arr = LsLanguage::parseFlatName($name_str);

    $person = new Entity();
    $person->addExtension('Person');

    // Read each field explicitly instead of extract()-ing the array into the
    // local scope, where an unexpected key could overwrite a local variable.
    $name_fields = array('name_prefix', 'name_first', 'name_middle', 'name_last', 'name_suffix', 'name_nick');
    foreach ($name_fields as $field) {
        $person->$field = isset($name_arr[$field]) ? $name_arr[$field] : null;
    }

    return $person;
}
/**
 * Builds a 'Mailing' address from a parsed Form 4 filing and attaches it to
 * $person.
 *
 * Nothing happens when the state text cannot be resolved by
 * AddressStateTable::retrieveByText(). When $person->addAddress() accepts the
 * address, the person is saved and the address gets a reference named
 * "<corp_name> Form 4".
 *
 * @param array  $address_arr keys street1, street2, city, state, postal
 * @param mixed  $person      entity receiving the address
 * @param array  $person_arr  keys form4Url, date
 * @param string $corp_name   company name used in the reference name
 */
private function importAddress($address_arr, $person, $person_arr, $corp_name)
{
    $address = new Address();
    $address->street1 = LsLanguage::nameize($address_arr['street1']);
    $address->street2 = LsLanguage::nameize($address_arr['street2']);
    $address->city = $address_arr['city'];
    $address->Category = Doctrine::getTable('AddressCategory')->findOneByName('Mailing');

    $resolvedState = AddressStateTable::retrieveByText($address_arr['state']);
    if (!$resolvedState) {
        return;
    }
    $address->State = $resolvedState;
    $address->postal = $address_arr['postal'];

    // NOTE(review): the result is not used in this method; the call is kept
    // as-is in case it has effects that are not visible here - confirm.
    $changedFields = $address->getAllModifiedFields();

    if (!$person->addAddress($address)) {
        return;
    }

    $person->save();
    $address->addReference(
        $person_arr['form4Url'],
        null,
        null,
        $corp_name . ' Form 4',
        null,
        $person_arr['date']
    );
}
/**
 * Finds or creates the entity for an FEC committee id.
 *
 * The committee name is read from the FEC page at
 * $this->fecCommitteeBaseUrl . $id and titleized. An existing, non-deleted
 * entity whose primary extension is 'Org' and which has an alias equal to
 * that name is returned as an associative row; otherwise a new
 * Org/PoliticalFundraising entity carrying the name and FEC id is saved and
 * returned.
 *
 * @param string $id FEC committee id
 *
 * @return array|Entity|null the existing row (array), the newly created
 *                           Entity, or null when the page cannot be fetched
 *                           or the name is not found on it
 */
public function getCommitteeEntityByFecId($id)
{
    //get name from FEC.gov
    $this->browser->get($this->fecCommitteeBaseUrl . $id);
    if ($this->browser->responseIsError()) {
        return null;
    }

    $page = $this->browser->getResponseText();
    $nameFound = preg_match('#<FONT SIZE=5><B>([^<]+)</B></FONT>#', $page, $match);
    if (!$nameFound) {
        return null;
    }
    $name = LsLanguage::titleize($match[1]);

    //look for an Org entity that has an alias with this name
    $sql = 'SELECT e.* FROM entity e ' . 'LEFT JOIN alias a ON (a.entity_id = e.id) ' . 'WHERE a.name = ? AND e.primary_ext = ? AND e.is_deleted = 0';
    $existing = $this->db->execute($sql, array($name, 'Org'))->fetch(PDO::FETCH_ASSOC);
    if ($existing) {
        return $existing;
    }

    if ($this->debugMode) {
        print "+ Creating new entity for committee " . $id . " (" . $name . ")\n";
    }

    $created = new Entity();
    $created->addExtension('Org');
    $created->addExtension('PoliticalFundraising');
    $created->name = $name;
    $created->fec_id = $id;
    $created->save();

    // Importing CRP's alternate names for the committee as aliases is
    // currently disabled.

    return $created;
}
/**
 * Scrapes one Forbes profile page and imports the person it describes.
 *
 * From the page text this reads the rank and name, net worth, age (turned
 * into an approximate birth year), education entries, a biography and an
 * image URL derived from the page URL. When the page has no biography block
 * the introduction returned by LsWikipedia is used instead. The person is
 * matched against existing business persons by first and last name; school
 * entities and Education relationships are created when missing. Finally the
 * person is saved, referenced to the page, added to the list with its rank
 * and given the image.
 *
 * Prints a message and returns when the page cannot be fetched; logs and
 * returns when no name can be parsed from it.
 *
 * @param string $url Profile page URL.
 *
 * @return void
 */
protected function import($url)
{
    $this->printDebug($url);

    if ($this->browser->get($url)->responseIsError()) {
        echo "Couldn't get person: " . $url . "\n";
        return;
    }

    $text = $this->browser->getResponseText();

    $bio = null;
    $name = null;
    $netWorth = null;
    $birthYear = null;
    // Start with an empty list so the school loop below is safe when the page
    // has no Education block.
    $schools = array();
    $imageUrl = null;
    $rank = null;

    //get name & rank (the 2005 list uses different markup than later years)
    if ($this->year > 2005 && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/', $text, $match)) {
        $name = trim($match[2]);
        $rank = $match[1];
    }
    if ($this->year == 2005 && preg_match('/<h2>#(\\d+) ([^<]+)<\\/h2>/', $text, $match)) {
        $name = trim($match[2]);
        $rank = $match[1];
    }

    // Without a name there is nobody to look up or create.
    if ($name === null) {
        $this->printDebug("Couldn't parse a name from: " . $url);
        return;
    }

    //get net worth (page states it in billions)
    if (preg_match('/Net Worth<\\/span> <span class="red">\\$([\\S]+) billion/', $text, $match)) {
        $netWorth = $match[1] * 1000000000;
    }

    //get birth year: current year minus age, stored as a date with unknown month/day
    if (preg_match('/>Age<\\/span> (\\d+)/', $text, $match)) {
        $birthYear = (date("Y") - $match[1]) . "-00-00";
    }

    //get schools
    if (preg_match('/Education<\\/span>(.*)<\\/td>/isU', $text, $match)) {
        // foreach visits every <br>-separated entry; an empty entry no longer
        // ends the scan early.
        foreach (explode('<br>', $match[1]) as $schoolPart) {
            if (!preg_match('/^([^,]+),\\s+<b>([^<]+)<\\/b>/is', trim($schoolPart), $schoolMatch)) {
                continue;
            }
            $schoolOrg = trim($schoolMatch[1]);
            if ($schoolOrg == 'High School') {
                continue;
            }
            $schools[] = array('org' => $schoolOrg, 'degree' => trim($schoolMatch[2]));
        }
    }

    //get bio, falling back to Wikipedia when the page has none
    if (preg_match('#<br>[\\n\\s]<br>(.+?)<br>[\\n\\s]<br>[\\n\\s]<img#isU', $text, $match)) {
        $bio = strip_tags(trim($match[1]));
    } else {
        $wikipedia = new LsWikipedia();
        if ($wikipedia->request($name)) {
            $bio = $wikipedia->getIntroduction();
        }
    }

    //get image: file name is the four-character page id from the URL
    if (preg_match('#([A-Z1-9]{4}).html#', $url, $match)) {
        $imageUrl = $this->list_urls[$this->year]['img_src'] . $match[1] . ".jpg";
    }

    $this->printDebug("Rank: " . $rank);
    $this->printDebug("Name: " . $name);
    $this->printDebug("Image: " . $imageUrl);
    $this->printDebug("Net worth: " . $netWorth);
    $this->printDebug("Birth year: " . $birthYear);
    $this->printDebug("Bio: " . $bio);

    //parse name and create person object, reusing an existing match if any
    $person = $this->generatePerson($name, $bio);
    $person_exists = $this->getBusinessPersonQuery()
        ->addWhere("person.name_first = ? AND person.name_last = ?", array($person->name_first, $person->name_last))
        ->fetchOne();
    if ($person_exists != false) {
        $this->printDebug('Person exists');
        $person = $person_exists;
    } else {
        $this->printDebug('Saving new person');
    }

    // Only fill in values the person does not already have.
    $person->addExtension('BusinessPerson');
    $person->start_date = $person->start_date == null ? $birthYear : $person->start_date;
    $person->summary = $person->summary == null ? $bio : $person->summary;
    $person->net_worth = $person->net_worth == null ? $netWorth : $person->net_worth;

    //go through schools person attended
    foreach ($schools as $school) {
        //does the current school exist?
        $current_school = EntityTable::getByExtensionQuery('Org')->addWhere("org.name = ?", $school['org'])->fetchOne();
        if ($current_school) {
            $this->printDebug(" Found School " . $school['org']);
        } else {
            //clear cache
            Doctrine::getTable('ExtensionDefinition')->clear();
            $current_school = new Entity();
            $current_school->addExtension('Org');
            $current_school->addExtension('School');
            $current_school->name = LsLanguage::titleize($school['org']);
            $current_school->save();
            // Arguments: source, excerpt, fields, name, detail, date.
            $current_school->addReference($url, null, array('name'), 'Forbes.com', null, null);
            $this->printDebug(" Adding new school: " . $school['org']);
        }

        //if there is no relationship between person and school. connect them!
        if (!$person->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)->fetchOne()) {
            $this->printDebug(" Creating Relation between " . $current_school->name . " and " . $person->name);
            $education = new Relationship();
            $education->Entity1 = $person;
            $education->Entity2 = $current_school;
            $education->setCategory('Education');
            $education->description1 = $school['degree'];
            $education->is_current = 1;
            $education->save();
            $education->addReference($url, null, array('description1'), 'Forbes.com', null, null);
        }
    }

    $person->save();
    $referenced_fields = array(
        'name_prefix',
        'name_first',
        'name_middle',
        'name_last',
        'name_suffix',
        'name_nick',
        'summary',
        'net_worth',
        'start_date',
    );
    $person->addReference($url, null, $referenced_fields, 'Forbes.com', null, null);
    $this->saveToList($person, $rank);
    $this->attachImage($person, $imageUrl);
}
/**
 * Builds an unsaved donor entity from a "<BR>"-separated text block.
 *
 * Segment 0 is used as the donor's name (after tag stripping), segment 1 is
 * passed to LsLanguage::parseCityStatePostal() to build the address, and
 * segment 2 becomes the summary with tags stripped. Segments or address
 * parts that are absent are treated as null / empty.
 *
 * @param string $text Donor text with segments separated by "<BR>".
 *
 * @return Entity Result of generatePerson() with an address added and the
 *                summary set; not saved here.
 */
private function generateDonor($text)
{
    // Pad to three segments so short input produces nulls rather than
    // undefined-offset notices.
    list($name_html, $location_line, $summary_html) = array_pad(explode("<BR>", $text), 3, null);

    $donor = $this->generatePerson(LsHtml::stripTags($name_html, ''));

    $address_arr = LsLanguage::parseCityStatePostal($location_line);

    $a = new Address();
    $a->street1 = isset($address_arr['street1']) ? $address_arr['street1'] : null;
    $a->street2 = isset($address_arr['street2']) ? $address_arr['street2'] : null;
    $a->city = isset($address_arr['city']) ? $address_arr['city'] : null;

    // The state is only set when its text resolves to a known state.
    $state_text = isset($address_arr['state']) ? $address_arr['state'] : null;
    if ($state = AddressStateTable::retrieveByText($state_text)) {
        $a->State = $state;
    }

    $a->postal = isset($address_arr['zip']) ? $address_arr['zip'] : null;
    $donor->addAddress($a);

    $donor->summary = strip_tags(trim((string) $summary_html));

    return $donor;
}
/**
 * Processes every corporation id in $this->corp_ids, one transaction each.
 *
 * For each corporation:
 *  - when it has no SEC CIK, looks one up by ticker via getCik(); on success
 *    stores the CIK and, if the corporation has no industries yet, finds or
 *    creates the SIC industry, links it, saves and references the EDGAR page;
 *  - when it has a CIK and more than one related 'Director' person, fetches
 *    the proxy if needed, paginates it and extracts name pages / basic info,
 *    recording an 'error' meta value when the proxy or pagination is missing;
 *  - records the 'scraped' meta value, then commits (or rolls back in test
 *    mode).
 *
 * @return void
 *
 * @throws Exception Any exception raised while processing a corporation is
 *                   rethrown after the transaction is rolled back.
 */
public function execute()
{
    foreach ($this->corp_ids as $corp_id) {
        try {
            $this->db->beginTransaction();
            $this->corp = Doctrine::getTable('Entity')->find($corp_id);

            if (!$this->corp->sec_cik) {
                if ($result = $this->getCik($this->corp->ticker)) {
                    $this->corp->sec_cik = $result['cik'];

                    if (!$this->corp->Industry->count()) {
                        if ($result['sic']['name'] && $result['sic']['name'] != '') {
                            // fetchOne() already returns the record (or a falsy
                            // value); it must not be called again on that result.
                            $industry = LsDoctrineQuery::create()
                                ->from('Industry i')
                                ->where('i.name = ? and i.code = ?', array($result['sic']['name'], $result['sic']['code']))
                                ->fetchOne();

                            if (!$industry) {
                                $industry = new Industry();
                                $industry->name = LsLanguage::nameize(LsHtml::replaceEntities($result['sic']['name']));
                                $industry->context = 'SIC';
                                $industry->code = $result['sic']['code'];
                                $industry->save();
                            }

                            // Only link the industry if that link does not exist yet.
                            $q = LsQuery::getByModelAndFieldsQuery('BusinessIndustry', array('industry_id' => $industry->id, 'business_id' => $this->corp->id));
                            if (!$q->fetchOne()) {
                                $this->corp->Industry[] = $industry;
                            }
                        }

                        // Capture the changed fields before saving.
                        // NOTE(review): presumably save() resets the modified
                        // state, which would leave the reference without
                        // fields if read afterwards - confirm.
                        $modified_fields = $this->corp->getAllModifiedFields();
                        $this->corp->save();
                        $this->corp->addReference($result['url'], null, $modified_fields, 'SEC EDGAR Page');
                    }
                }

                $this->corp->save();
            }

            if ($this->corp->sec_cik) {
                $category = Doctrine::getTable('RelationshipCategory')->findOneByName('Position');
                $this->people = $this->corp->getRelatedEntitiesQuery('Person', $category->id, 'Director', null, null, false)->execute();

                if (count($this->people) > 1) {
                    if ($this->need_proxy) {
                        $this->getProxy();
                        // NOTE(review): the flag is already true here, so this
                        // assignment has no effect - confirm the intended value.
                        $this->need_proxy = true;
                    }

                    if ($this->url) {
                        $this->paginate();
                        if ($this->pages) {
                            $this->printDebug('paginated');
                            $this->findNamePages();
                            $this->findBasicInfo();
                        } else {
                            $this->saveMeta($this->corp->id, 'error', 'not_paginated');
                            $this->printDebug('not paginated');
                        }
                    } else {
                        $this->saveMeta($this->corp->id, 'error', 'no_proxy_retrieved');
                        $this->printDebug('could not get proxy');
                    }
                }
            }

            $this->saveMeta($this->corp->id, 'scraped', '1');

            if (!$this->testMode) {
                $this->db->commit();
            } else {
                $this->db->rollback();
            }
        } catch (Exception $e) {
            //something bad happened, rollback
            $this->db->rollback();
            throw $e;
        }
    }
}
<?php /* One table row per stat entry: formatted count on the left, pluralized display name on the right. When $filter is set and truthy, only rows whose 'num' exceeds 100 are rendered. */ ?>
<?php foreach ($stats as $rows) : ?>
  <?php foreach ($rows as $row) : ?>
    <?php if (empty($filter) || $row['num'] > 100) : ?>
      <tr>
        <td style="text-align: right; padding-right: 0.5em;">
          <strong><?php echo format_number($row['num']); ?> </strong>
        </td>
        <td> <?php echo LsLanguage::pluralize($row['display_name']); ?> </td>
      </tr>
    <?php endif; ?>
  <?php endforeach; ?>
<?php endforeach; ?>
</table>
</div>
/**
 * Turns a "Last, Rest" lobbyist name string into a person object.
 *
 * Parenthesised text is removed and whitespace runs are collapsed. The text
 * before the first comma is the last name; everything after it is the rest
 * of the name. A quoted nickname and any trailing suffixes listed in
 * PersonTable::$nameParseSuffixes are pulled out of the rest before the
 * remainder is handed to PersonTable::parseFlatName().
 *
 * @param string $str Raw name, expected in "Last, First ..." form.
 *
 * @return mixed|null The object returned by PersonTable::parseFlatName() with
 *                    the Lobbyist extension added, or null when the input has
 *                    no comma or the parsed last name is empty.
 */
public function prepLobbyistName($str)
{
    //get rid of extra spaces and stuff in parens
    $cleaned = preg_replace(array('/\\([^\\)]*\\)?/s', '/\\s+/s'), array('', ' '), $str);
    $pieces = explode(',', trim($cleaned));

    //no comma, no parsable name (for now)
    if (count($pieces) < 2) {
        return null;
    }

    $name_last = trim(array_shift($pieces));
    $name_rest = trim(implode(' ', $pieces));

    // A quoted fragment is taken as the nickname and removed from the name.
    $name_nick = null;
    if (preg_match('/["\'](.*?)["\']/isu', $name_rest, $quoted, PREG_OFFSET_CAPTURE) == 1) {
        $name_nick = $quoted[1][0];
        $name_rest = str_replace($quoted[0][0], '', $name_rest);
    }

    // Peel trailing suffixes off one at a time, rescanning the suffix list
    // from the start after each hit; suffixes are prepended so they end up in
    // the order they appeared in the name.
    $name_suffix = null;
    $suffixes = PersonTable::$nameParseSuffixes;
    do {
        $stripped = false;
        foreach ($suffixes as $suffix) {
            if (!$suffix) {
                // A falsy entry ends the scan altogether.
                break;
            }
            $without = preg_replace('/ ' . $suffix . '$/i', '', $name_rest);
            if ($name_rest != $without) {
                $name_suffix = $suffix . ' ' . $name_suffix;
                $name_rest = trim($without);
                $stripped = true;
                break;
            }
        }
    } while ($stripped);

    if ($name_suffix) {
        $name_suffix = trim($name_suffix);
    } else {
        $name_suffix = null;
    }

    $person = PersonTable::parseFlatName($name_rest . ' ' . $name_last, $name_last);

    if ($name_nick) {
        $person->name_nick = LsLanguage::nameize($name_nick);
    }
    if ($name_suffix) {
        $person->name_suffix = $name_suffix;
    }

    $person->addExtension('Lobbyist');
    $person->name_last = trim($person->name_last);

    // A person without a last name is not usable.
    if (!$person->name_last || $person->name_last == '') {
        return null;
    }

    return $person;
}