/**
 * Scrapes one profile page and stores the person it describes.
 *
 * Fetches $url with $this->browser, extracts rank, name, net worth, age,
 * education and biography from the returned HTML, then creates or updates
 * the matching BusinessPerson entity, links it to the schools listed on
 * the page, adds it to the list via saveToList() and attaches its image.
 *
 * Nothing is saved when the page cannot be fetched or when no name can be
 * parsed from it; both cases are reported with echo.
 *
 * @param string $url Address of the profile page to import.
 *
 * @return void
 */
protected function import($url)
{
    $this->printDebug($url);

    if ($this->browser->get($url)->responseIsError()) {
        echo "Couldn't get person: " . $url . "\n";

        return;
    }

    $text = $this->browser->getResponseText();

    $bio = null;
    $name = null;
    $netWorth = null;
    $birthYear = null;
    $imageUrl = null;
    $rank = null;
    // Must be an array even when the page has no Education section,
    // because it is iterated unconditionally further down.
    $schools = array();

    // Name and rank: the markup differs between the 2005 list and later years.
    if ($this->year > 2005 && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/', $text, $match)) {
        $name = trim($match[2]);
        $rank = $match[1];
    }
    if ($this->year == 2005 && preg_match('/<h2>#(\\d+) ([^<]+)<\\/h2>/', $text, $match)) {
        $name = trim($match[2]);
        $rank = $match[1];
    }

    // Without a name there is nothing to look up or create; skip the page
    // instead of generating and saving a nameless person.
    if ($name === null) {
        echo "Couldn't parse person name: " . $url . "\n";

        return;
    }

    // Net worth is published in billions of dollars.
    if (preg_match('/Net Worth<\\/span> <span class="red">\\$([\\S]+) billion/', $text, $match)) {
        $netWorth = $match[1] * 1000000000;
    }

    // Only the age is published, so the birth date is approximated as
    // "<current year - age>-00-00". The parentheses make the intended
    // evaluation order explicit.
    if (preg_match('/>Age<\\/span> (\\d+)/', $text, $match)) {
        $birthYear = (date("Y") - $match[1]) . "-00-00";
    }

    // Education: one "<school>, <b><degree></b>" entry per <br>-separated
    // fragment. Iterating with foreach visits every fragment, including
    // those that follow an empty one.
    if (preg_match('/Education<\\/span>(.*)<\\/td>/isU', $text, $match)) {
        foreach (explode('<br>', $match[1]) as $schoolPart) {
            if (!preg_match('/^([^,]+),\\s+<b>([^<]+)<\\/b>/is', trim($schoolPart), $schoolMatch)) {
                continue;
            }

            $schoolOrg = trim($schoolMatch[1]);
            if ($schoolOrg == 'High School') {
                // Generic entry, not an identifiable organisation.
                continue;
            }

            $schools[] = array('org' => $schoolOrg, 'degree' => trim($schoolMatch[2]));
        }
    }

    // Biography: taken from the page when present, otherwise from the
    // introduction LsWikipedia returns for the name.
    if (preg_match('#<br>[\\n\\s]<br>(.+?)<br>[\\n\\s]<br>[\\n\\s]<img#isU', $text, $match)) {
        $bio = strip_tags(trim($match[1]));
    } else {
        $wikipedia = new LsWikipedia();
        if ($wikipedia->request($name)) {
            $bio = $wikipedia->getIntroduction();
        }
    }

    // Image: the four-character id in front of ".html" doubles as the image
    // file name. NOTE(review): the previous character class was [A-Z1-9],
    // which excluded the digit 0 and looked accidental - confirm against
    // real profile URLs.
    if (preg_match('#([A-Z0-9]{4})\\.html#', $url, $match)) {
        $imageUrl = $this->list_urls[$this->year]['img_src'] . $match[1] . ".jpg";
    }

    $this->printDebug("Rank: " . $rank);
    $this->printDebug("Name: " . $name);
    $this->printDebug("Image: " . $imageUrl);
    $this->printDebug("Net worth: " . $netWorth);
    $this->printDebug("Birth year: " . $birthYear);
    $this->printDebug("Bio: " . $bio);

    // Build a person from the parsed name, then prefer an already stored
    // business person with the same first and last name.
    $person = $this->generatePerson($name, $bio);
    $existingPerson = $this->getBusinessPersonQuery()
        ->addWhere(
            "person.name_first = ? AND person.name_last = ?",
            array($person->name_first, $person->name_last)
        )
        ->fetchOne();

    if ($existingPerson != false) {
        $this->printDebug('Person exists');
        $person = $existingPerson;
    } else {
        $this->printDebug('Saving new person');
    }

    // Fill in scraped values only where the stored record has none.
    $person->addExtension('BusinessPerson');
    $person->start_date = $person->start_date == null ? $birthYear : $person->start_date;
    $person->summary = $person->summary == null ? $bio : $person->summary;
    $person->net_worth = $person->net_worth == null ? $netWorth : $person->net_worth;

    foreach ($schools as $school) {
        // Reuse an organisation with exactly this name, or create a new school.
        $currentSchool = EntityTable::getByExtensionQuery('Org')
            ->addWhere("org.name = ?", $school['org'])
            ->fetchOne();

        if ($currentSchool) {
            $this->printDebug(" Found School " . $school['org']);
        } else {
            // Clear the ExtensionDefinition table cache before creating the entity.
            Doctrine::getTable('ExtensionDefinition')->clear();

            $currentSchool = new Entity();
            $currentSchool->addExtension('Org');
            $currentSchool->addExtension('School');
            $currentSchool->name = LsLanguage::titleize($school['org']);
            $currentSchool->save();
            $currentSchool->addReference($url, null, array('name'), 'Forbes.com', null, null);
            $this->printDebug(" Adding new school: " . $school['org']);
        }

        // Connect person and school unless an education relationship exists.
        $hasEducation = $person
            ->getRelationshipsWithQuery($currentSchool, RelationshipTable::EDUCATION_CATEGORY)
            ->fetchOne();

        if (!$hasEducation) {
            $this->printDebug(" Creating Relation between " . $currentSchool->name . " and " . $person->name);

            $education = new Relationship();
            $education->Entity1 = $person;
            $education->Entity2 = $currentSchool;
            $education->setCategory('Education');
            $education->description1 = $school['degree'];
            $education->is_current = 1;
            $education->save();
            $education->addReference($url, null, array('description1'), 'Forbes.com', null, null);
        }
    }

    $person->save();
    $person->addReference(
        $url,
        null,
        array(
            'name_prefix',
            'name_first',
            'name_middle',
            'name_last',
            'name_suffix',
            'name_nick',
            'summary',
            'net_worth',
            'start_date',
        ),
        'Forbes.com',
        null,
        null
    );

    $this->saveToList($person, $rank);
    $this->attachImage($person, $imageUrl);
}
/**
 * Copies education history from externally found candidates onto $person.
 *
 * Each candidate is compared with $person by passing both biographies
 * (summary plus employer / related organisation names) to
 * LsLanguage::getCommonPronouns(). When that yields at least one match,
 * every school of the candidate is resolved to an entity (by 'bw_school'
 * alias, by name, or created and enriched from a Wikipedia info box), and
 * an Education relationship carrying the degree is created or completed.
 *
 * Candidates without education or employment history are skipped.
 *
 * @param Entity $person           Stored person to enrich.
 * @param mixed  $possible_persons Iterable list of candidate records; each
 *                                 is read through its name, summary,
 *                                 education and employment_history members.
 *
 * @return bool False as soon as a candidate produces no matches (remaining
 *              candidates are not examined), true otherwise.
 */
function import(Entity $person, $possible_persons)
{
    foreach ($possible_persons as $possible_person) {
        // Report the size of the candidate list, not of the single candidate.
        $this->printDebug('Query returned ' . count($possible_persons) . ' person named ' . $possible_person->name);

        if (!count($possible_person->education)) {
            $this->printDebug('No education history found');
            continue;
        }
        $this->printDebug('Education found');

        // Candidate biography: summary followed by every employer name.
        $possible_person_bio = $possible_person->summary;
        if (!count($possible_person->employment_history)) {
            $this->printDebug('No employment history found');
            continue;
        }
        foreach ($possible_person->employment_history as $employment) {
            $possible_person_bio .= ' ' . $employment->company . " ";
        }
        $this->printDebug('Employment found');

        // Stored biography: summary followed by the names of the
        // organisations related through position relationships.
        $relationship_orgs = $person
            ->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 1)
            ->execute();
        $person_bio = $person->summary;
        foreach ($relationship_orgs as $org) {
            $person_bio .= ' ' . $org->name;
        }

        $matches = LsLanguage::getCommonPronouns($person_bio, trim($possible_person_bio), LsLanguage::$business);
        if (!count($matches)) {
            $this->printDebug('No organization matches');

            return false;
        }

        foreach ($possible_person->education as $school) {
            $school->institution = mb_convert_encoding($school->institution, 'UTF-8');
            // NOTE(review): the pattern here used to be empty ('//isu'),
            // which inserts a space between every character of the name.
            // Presumably a non-breaking space was lost from the pattern;
            // normalising U+00A0 to a plain space is the assumed intent -
            // confirm against the upstream data.
            $school->institution = preg_replace('/\\x{00A0}/u', ' ', $school->institution);
            $this->printDebug('Looking for the school: ' . $school->institution);

            $current_school = EntityTable::findByAlias($school->institution, 'bw_school');

            if ($current_school) {
                $this->printDebug('Found school');
            } else {
                // No alias yet: fall back to a case-insensitive name search.
                // mb_strtolower keeps multi-byte UTF-8 characters intact.
                $current_school = EntityTable::getByExtensionQuery(array('Org', 'School'))
                    ->addWhere('LOWER(org.name) LIKE ?', '%' . mb_strtolower($school->institution, 'UTF-8') . "%")
                    ->fetchOne();

                if (!$current_school) {
                    $new_school = new Entity();
                    $new_school->addExtension('Org');
                    $new_school->addExtension('School');
                    $new_school->name = $school->institution;

                    $wikipedia = new LsWikipedia();
                    $wikipedia->request($school->institution);

                    if ($wikipedia->execute() && !$wikipedia->isDisambiguation()) {
                        $info_box = $wikipedia->getInfoBox();

                        // Student count: the total when given, otherwise
                        // the sum of the per-level figures.
                        if (isset($info_box['students']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['students']['clean'], $match)) {
                            $new_school->students = LsNumber::clean($match[1]);
                        } else {
                            $num_students = 0;
                            foreach (array('undergrad', 'postgrad', 'grad', 'doctoral') as $student_type) {
                                if (isset($info_box[$student_type]) && preg_match('/([\\d\\,]{2,})/isu', $info_box[$student_type]['clean'], $match)) {
                                    $num_students += LsNumber::clean($match[1]);
                                }
                            }
                            if ($num_students > 0) {
                                $new_school->students = $num_students;
                            }
                        }

                        if (isset($info_box['faculty']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['faculty']['clean'], $match)) {
                            $new_school->faculty = LsNumber::clean($match[1]);
                        }

                        // "public" wins when both words appear in the type.
                        if (isset($info_box['type'])) {
                            if (stristr($info_box['type']['clean'], 'public')) {
                                $new_school->is_private = 0;
                            } elseif (stristr($info_box['type']['clean'], 'private')) {
                                $new_school->is_private = 1;
                            }
                        }

                        if (isset($info_box['endowment']) && preg_match('/(\\$[\\d\\,\\.\\s]+)(million|billion)/isu', $info_box['endowment']['clean'], $match)) {
                            $factor = strtolower($match[2]) == 'billion' ? 1000000000 : 1000000;
                            $new_school->endowment = LsNumber::formatDollarAmountAsNumber($match[1], $factor);
                        }

                        // Founding date: a convertible date when possible,
                        // otherwise the first standalone four-digit number.
                        if (isset($info_box['established'])) {
                            if ($established = LsDate::convertDate($info_box['established']['clean'])) {
                                $new_school->start_date = $established;
                            } elseif (preg_match('/\\b(\\d\\d\\d\\d)\\b/isu', $info_box['established']['clean'], $match)) {
                                $new_school->start_date = $match[1];
                            }
                        }

                        $summary = trim($wikipedia->getIntroduction());
                        $summary = preg_replace('/\\n\\s*\\n/isu', '', $summary);
                        if (strlen($summary) > 10) {
                            $new_school->summary = $summary;
                        }

                        $new_school->save();
                        $new_school->addReference($wikipedia->getUrl(), null, array('summary'), 'Wikipedia');
                    } else {
                        $new_school->save();
                    }

                    $current_school = $new_school;
                    $this->printDebug('Adding new school');
                }

                // Remember this spelling so the alias lookup succeeds next time.
                $alias = new Alias();
                $alias->name = $school->institution;
                $alias->context = 'bw_school';
                $alias->Entity = $current_school;
                $alias->save();
            }

            // Resolve the degree, registering it when it is unknown.
            $degree = DegreeTable::getByText($school->degree);
            if (!$degree) {
                $degree = DegreeTable::addDegree($school->degree);
                $this->printDebug('Adding new degree');
            }

            // Reuse an education relationship that already has this degree.
            $relationship = null;
            $relationships = $person
                ->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)
                ->execute();
            foreach ($relationships as $existing_relationship) {
                if ($existing_relationship->degree_id == $degree->id) {
                    $relationship = $existing_relationship;
                    break;
                }
            }

            if ($relationship) {
                $this->printDebug('Relationship between person and school exists');
            } else {
                $relationship = new Relationship();
                $relationship->Entity1 = $person;
                $relationship->Entity2 = $current_school;
                $relationship->description1 = 'student';
                $relationship->is_current = 0;
                if ($school->year) {
                    $relationship->end_date = $school->year;
                }
                $relationship->setCategory('Education');
                $this->printDebug('Creating new relationship between person and school');
            }

            $relationship->save();

            // Attach the degree and cite where the education record came from.
            if ($relationship->degree_id == null) {
                $reference_name = strstr($school->source, 'wikipedia') ? "Wikipedia" : "BusinessWeek";
                $relationship->Degree = $degree;
                $relationship->save();
                $relationship->addReference($school->source, null, array('degree_id'), $reference_name, null, null);
                $this->printDebug('Adding degree and reference');
            }
        }
    }

    return true;
}