protected function getPhotoFromWikipedia(Entity $person)
 {
     if ($this->imageExists($person)) {
         return true;
     }
     //construct search query
     $wikipedia = new LsWikipedia();
     $response = $wikipedia->requestImages($person->name);
     if (!$response) {
         return false;
     }
     $matches = $response->query->pages->page;
     $this->printDebug("Images on Wikipedia page: " . count($matches));
     foreach ($matches as $match) {
         $ii = (array) $match->imageinfo->ii;
         $attributes = $ii['@attributes'];
         $photo_url = $attributes['url'];
         $photo_alt = $attributes['comment'];
         //var_dump($photo_alt);
         $this->printDebug("Checking: " . $photo_url);
         if (preg_match('/^http.*' . $person->name_last . '.*(jpg|jpeg|gif)$/i', $photo_url)) {
             $this->printDebug("Photo found on URL: " . $photo_url);
             return $this->attachImage($person, $photo_url, 'Photograph');
         }
     }
     $this->printDebug("Photo not found");
     return false;
 }
 protected function import($url)
 {
     $person = null;
     $this->printDebug($url);
     if (!$this->browser->get($url)->responseIsError()) {
         $text = $this->browser->getResponseText();
         $bio = null;
         $name = null;
         $netWorth = null;
         $birthYear = null;
         $schools = null;
         $schools = null;
         $imageUrl = null;
         $rank = null;
         //get name & rank
         if ($this->year > 2005 && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/', $text, $match)) {
             $name = trim($match[2]);
             $rank = $match[1];
         }
         if ($this->year == 2005 && preg_match('/<h2>#(\\d+) ([^<]+)<\\/h2>/', $text, $match)) {
             $name = trim($match[2]);
             $rank = $match[1];
         }
         //get net worth
         if (preg_match('/Net Worth<\\/span> <span class="red">\\$([\\S]+) billion/', $text, $match)) {
             $netWorth = $match[1] * 1000000000;
         }
         //get birth year
         if (preg_match('/>Age<\\/span> (\\d+)/', $text, $match)) {
             $birthYear = date("Y") - $match[1] . "-00-00";
         }
         //get schools
         if (preg_match('/Education<\\/span>(.*)<\\/td>/isU', $text, $match)) {
             $schools = array();
             $schoolParts = explode('<br>', $match[1]);
             while ($schoolPart = current($schoolParts)) {
                 if (preg_match('/^([^,]+),\\s+<b>([^<]+)<\\/b>/is', trim($schoolPart), $match)) {
                     $schoolOrg = trim($match[1]);
                     if ($schoolOrg == 'High School') {
                         next($schoolParts);
                         continue;
                     }
                     $schoolDegree = trim($match[2]);
                     $schools[] = array('org' => $schoolOrg, 'degree' => $schoolDegree);
                 }
                 next($schoolParts);
             }
         }
         if (preg_match('#<br>[\\n\\s]<br>(.+?)<br>[\\n\\s]<br>[\\n\\s]<img#isU', $text, $match)) {
             $bio = strip_tags(trim($match[1]));
         } else {
             $wikipedia = new LsWikipedia();
             if ($wikipedia->request($name)) {
                 $bio = $wikipedia->getIntroduction();
             }
         }
         //get image
         $regexp = '#([A-Z1-9]{4}).html#';
         if (preg_match($regexp, $url, $match)) {
             $imageFilename = $match[1] . ".jpg";
             $imageUrl = $this->list_urls[$this->year]['img_src'] . $imageFilename;
         }
         //echo "Rank: " . $rank . "\n";
         $this->printDebug("Rank: " . $rank);
         $this->printDebug("Name: " . $name);
         $this->printDebug("Image: " . $imageUrl);
         $this->printDebug("Net worth: " . $netWorth);
         $this->printDebug("Birth year: " . $birthYear);
         $this->printDebug("Bio: " . $bio);
         $person = $this->generatePerson($name, $bio);
         $person_exists = $this->getBusinessPersonQuery()->addWhere("person.name_first = ? AND person.name_last = ?", array($person->name_first, $person->name_last))->fetchOne();
         if ($person_exists != false) {
             $this->printDebug('Person exists');
             $person = $person_exists;
         } else {
             $this->printDebug('Saving new person');
         }
         //parse name and create person object
         $person->addExtension('BusinessPerson');
         $person->start_date = $person->start_date == null ? $birthYear : $person->start_date;
         $person->summary = $person->summary == null ? $bio : $person->summary;
         $person->net_worth = $person->net_worth == null ? $netWorth : $person->net_worth;
         //go through schools person attended
         foreach ($schools as $school) {
             //does the current school exist?
             $current_school = EntityTable::getByExtensionQuery('Org')->addWhere("org.name = ?", $school['org'])->fetchOne();
             if ($current_school) {
                 $this->printDebug("  Found School " . $school['org']);
             } else {
                 //clear cache
                 Doctrine::getTable('ExtensionDefinition')->clear();
                 $current_school = new Entity();
                 $current_school->addExtension('Org');
                 $current_school->addExtension('School');
                 $current_school->name = LsLanguage::titleize($school['org']);
                 $current_school->save();
                 $current_school->addReference($source = $url, $excerpt = null, $fields = array('name'), $name = 'Forbes.com', $detail = null, $date = null);
                 $this->printDebug("  Adding new school: " . $school['org']);
             }
             //if there is no relationship between person and school. connect them!
             if (!$person->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)->fetchOne()) {
                 $this->printDebug("  Creating Relation between " . $current_school->name . " and " . $person->name);
                 $education = new Relationship();
                 $education->Entity1 = $person;
                 $education->Entity2 = $current_school;
                 $education->setCategory('Education');
                 $education->description1 = $school['degree'];
                 $education->is_current = 1;
                 $education->save();
                 $education->addReference($source = $url, $excerpt = null, $fields = array('description1'), $name = 'Forbes.com', $detail = null, $date = null);
             }
         }
         $person->save();
         $person->addReference($source = $url, $excerpt = null, $fields = array('name_prefix', 'name_first', 'name_middle', 'name_last', 'name_suffix', 'name_nick', 'summary', 'net_worth', 'start_date'), $name = 'Forbes.com', $detail = null, $date = null);
         $this->saveToList($person, $rank);
         $this->attachImage($person, $imageUrl);
     } else {
         echo "Couldn't get person: " . $url . "\n";
     }
 }
Beispiel #3
0
 protected function getLogoFromWikipedia(Entity $org)
 {
     if ($this->imageExists($org)) {
         return true;
     }
     //construct search query
     $wikipedia = new LsWikipedia();
     $response = $wikipedia->requestImages($org->name);
     if (!$response) {
         return false;
     }
     $matches = $response->query->pages->page;
     $this->printDebug("Images on Wikipedia page: " . count($matches));
     foreach ($matches as $match) {
         $ii = (array) $match->imageinfo->ii;
         $attributes = $ii['@attributes'];
         $logo_image_url = $attributes['url'];
         $logo_image_alt = $attributes['comment'];
         if (preg_match('/^http.*(Wikinews|Wikiversity|Wikisource|Wikiquote|Wikibooks|Wiktionary|Commons-logo)/i', $logo_image_url)) {
             continue;
         }
         $this->printDebug("Checking:  " . $logo_image_url);
         $org_name_parts = split("[ \\.\\_\\-]", strtolower($org->name));
         $org_name_parts[] = "logo";
         $org_name_parts[] = "seal";
         $image_name_parts = split("[ \\.\\_\\-]", preg_replace("/[0-9]/", "", strtolower(basename(urldecode($logo_image_url)))));
         $intersect = array_intersect($image_name_parts, $org_name_parts);
         if (count($intersect) >= 2) {
             if ($this->attachImage($org, $logo_image_url, 'Organization logo')) {
                 $this->printDebug("Saved");
                 return true;
             }
         }
         /*
         if((preg_match('/^http.*(logo|seal).*(png|gif|svg|jpg)$/i', $logo_image_url)) )
         {
           $this->printDebug( "Found 1");
           if($this->attachImage($org, $logo_image_url, 'Organization logo'))
           {
             $this->printDebug( "Committed");
             return true;
           }
         }
         
         if( (preg_match('/(png|gif|svg|jpg)$/i',$logo_image_url) && preg_match('/\b(logo|seal)\b/i',$logo_image_alt)) )
         {
           $this->printDebug( "Found 2");
           if($this->attachImage($org, $logo_image_url, 'Organization logo'))
           {
             $this->printDebug( "Committed");
             return true;
           }
         }
         
         if(preg_match('/^http.*'.str_replace(LsLanguage::$punctuations, '_', $org->name).'.+\.(png|gif|svg)$/i', $logo_image_url))
         {
           $this->printDebug( "Found 3");
           if( $this->attachImage($org, $logo_image_url, 'Organization logo'))
           {
             $this->printDebug( "Committed");
             return true;  
           }        
         }
         */
     }
     $this->printDebug("Logo not found on Wikipedia");
     return false;
 }
 function import(Entity $person, $possible_persons)
 {
     //loop through the people we found. usually just one.
     foreach ($possible_persons as $possible_person) {
         $this->printDebug('Query returned ' . count($possible_person) . ' person named ' . $possible_person->name);
         //this person does not provide education. we skip
         if (count($possible_person->education)) {
             $this->printDebug('Education found');
         } else {
             $this->printDebug('No education history found');
             continue;
         }
         //get employement info for this possible match
         $possible_person_bio = $possible_person->summary;
         if (count($possible_person->employment_history)) {
             foreach ($possible_person->employment_history as $employment) {
                 $possible_person_bio .= ' ' . $employment->company . " ";
             }
             $this->printDebug('Employment found');
         } else {
             $this->printDebug('No employment history found');
             continue;
         }
         //get employment info for the person in our database
         $relationship_orgs = $person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 1)->execute();
         $person_bio = $person->summary;
         foreach ($relationship_orgs as $org) {
             $person_bio .= ' ' . $org->name;
         }
         //lets see how many matches we get
         $matches = LsLanguage::getCommonPronouns($person_bio, trim($possible_person_bio), LsLanguage::$business);
         if (count($matches)) {
             foreach ($possible_person->education as $school) {
                 $school->institution = mb_convert_encoding($school->institution, 'UTF-8');
                 $school->institution = preg_replace('/–/isu', ' ', $school->institution);
                 $this->printDebug('Looking for the school: ' . $school->institution);
                 $current_school = EntityTable::findByAlias($school->institution, $context = 'bw_school');
                 //find school
                 if ($current_school) {
                     $this->printDebug('Found school');
                 } else {
                     $current_school = EntityTable::getByExtensionQuery(array('Org', 'School'))->addWhere('LOWER(org.name) LIKE ?', '%' . strtolower($school->institution) . "%")->fetchOne();
                     if (!$current_school) {
                         $new_school = new Entity();
                         $new_school->addExtension('Org');
                         $new_school->addExtension('School');
                         $new_school->name = $school->institution;
                         $wikipedia = new LsWikipedia();
                         $wikipedia->request($school->institution);
                         if ($wikipedia->execute() && !$wikipedia->isDisambiguation()) {
                             $info_box = $wikipedia->getInfoBox();
                             if (isset($info_box['students']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['students']['clean'], $match)) {
                                 $new_school->students = LsNumber::clean($match[1]);
                             } else {
                                 $student_types = array('undergrad', 'postgrad', 'grad', 'doctoral');
                                 $num_students = 0;
                                 foreach ($student_types as $st) {
                                     if (isset($info_box[$st]) && preg_match('/([\\d\\,]{2,})/isu', $info_box[$st]['clean'], $match)) {
                                         $num_students += LsNumber::clean($match[1]);
                                     }
                                 }
                                 if ($num_students > 0) {
                                     $new_school->students = $num_students;
                                 }
                             }
                             if (isset($info_box['faculty']) && preg_match('/([\\d\\,]{2,})/isu', $info_box['faculty']['clean'], $match)) {
                                 $new_school->faculty = LsNumber::clean($match[1]);
                             }
                             if (isset($info_box['type'])) {
                                 if (stristr($info_box['type']['clean'], 'public')) {
                                     $new_school->is_private = 0;
                                 } else {
                                     if (stristr($info_box['type']['clean'], 'private')) {
                                         $new_school->is_private = 1;
                                     }
                                 }
                             }
                             if (isset($info_box['endowment'])) {
                                 if (preg_match('/(\\$[\\d\\,\\.\\s]+)(million|billion)/isu', $info_box['endowment']['clean'], $match)) {
                                     if (strtolower($match[2]) == 'billion') {
                                         $factor = 1000000000;
                                     } else {
                                         $factor = 1000000;
                                     }
                                     $new_school->endowment = LsNumber::formatDollarAmountAsNumber($match[1], $factor);
                                 }
                             }
                             if (isset($info_box['established'])) {
                                 $year = null;
                                 if ($date = LsDate::convertDate($info_box['established']['clean'])) {
                                     $new_school->start_date = $date;
                                 } else {
                                     if (preg_match('/\\b(\\d\\d\\d\\d)\\b/isu', $info_box['established']['clean'], $match)) {
                                         $new_school->start_date = $match[1];
                                     }
                                 }
                             }
                             $summary = trim($wikipedia->getIntroduction());
                             $summary = preg_replace('/\\n\\s*\\n/isu', '', $summary);
                             if (strlen($summary) > 10) {
                                 $new_school->summary = $summary;
                             }
                             $new_school->save();
                             $new_school->addReference($source = $wikipedia->getUrl(), $excerpt = null, $fields = array('summary'), $name = 'Wikipedia');
                         } else {
                             $new_school->save();
                         }
                         $current_school = $new_school;
                         $this->printDebug('Adding new school');
                     }
                     $alias = new Alias();
                     $alias->name = $school->institution;
                     $alias->context = 'bw_school';
                     $alias->Entity = $current_school;
                     $alias->save();
                 }
                 //find degree
                 $degree = null;
                 if (!($degree = DegreeTable::getByText($school->degree))) {
                     $degree = DegreeTable::addDegree($school->degree);
                     $this->printDebug('Adding new degree');
                 }
                 //find relationship
                 $relationship = null;
                 $relationships = $person->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)->execute();
                 foreach ($relationships as $existing_relationship) {
                     if ($existing_relationship->degree_id == $degree->id) {
                         $relationship = $existing_relationship;
                         break;
                     }
                 }
                 if ($relationship) {
                     $this->printDebug('Relationship between person and school exists');
                 } else {
                     $relationship = new Relationship();
                     $relationship->Entity1 = $person;
                     $relationship->Entity2 = $current_school;
                     $relationship->description1 = 'student';
                     $relationship->is_current = 0;
                     if ($school->year) {
                         $relationship->end_date = $school->year;
                     }
                     $relationship->setCategory('Education');
                     $this->printDebug('Creating new relationship between person and school');
                 }
                 //save
                 $relationship->save();
                 //add degree and reference
                 if ($relationship->degree_id == null) {
                     $reference_name = strstr($school->source, 'wikipedia') ? "Wikipedia" : "BusinessWeek";
                     $relationship->Degree = $degree;
                     $relationship->save();
                     $relationship->addReference($source = $school->source, $excerpt = null, $fields = array('degree_id'), $name = $reference_name, $detail = null, $date = null);
                     $this->printDebug('Adding degree and reference');
                 }
             }
         } else {
             $this->printDebug('No organization matches');
             return false;
         }
     }
     return true;
 }