示例#1
0
 /**
  * Looks up an organization's website through Google and stores it on the org.
  *
  * Each search result is reduced to its scheme-and-host root URL. A root URL
  * accepted by checkUrl() is then confirmed by running "site:" searches for
  * the people related to the org; the first root on which at least one person
  * is found wins. The org is saved at that point, and its website field is
  * filled in only when it was previously empty.
  *
  * @param object $org entity providing name, website, getRelatedEntitiesQuery() and save()
  *
  * @return void
  */
 private function getWebsite($org)
 {
     $this->printDebug($org->name);
     $google_scraper = new LsGoogle();
     $google_scraper->setQuery(trim($org->name));
     $google_scraper->execute();
     $results = $google_scraper->getResults();
     // The related people do not change between candidate URLs, so they are
     // loaded once, and only if some URL actually passes checkUrl().
     $people = null;
     foreach ($results as $result) {
         // Keep only "scheme://host/". Both http and https are accepted;
         // results without such a prefix are ignored.
         if (!preg_match('#https?://[^/]+/#isu', $result->unescapedUrl, $match)) {
             continue;
         }
         $trimmed_url = $match[0];
         if (!$this->checkUrl($trimmed_url, $org->name)) {
             $this->printDebug('failed: ' . $result->url);
             continue;
         }
         $this->printDebug('passed: ' . $result->url);
         if ($people === null) {
             $people = $org->getRelatedEntitiesQuery('Person')->execute();
         }
         $people_ct = 0;
         foreach ($people as $p) {
             $q = 'site:' . $trimmed_url . ' "' . $p->name_first . ' ' . $p->name_last . '"';
             $this->printDebug($q);
             $google_scraper->setQuery($q);
             $google_scraper->execute();
             if ($google_scraper->getNumResults() > 0) {
                 $people_ct++;
             }
             // Two confirmed people are enough; stop issuing further queries.
             if ($people_ct > 1) {
                 break;
             }
         }
         if ($people_ct == 0) {
             $this->printDebug('no people found');
             $this->printDebug('');
             continue;
         }
         if (!$org->website) {
             $org->website = $trimmed_url;
             $this->printDebug('website saved for ' . $org->name . ': ' . $trimmed_url);
         }
         $org->save();
         break;
     }
     $this->printDebug('***');
 }
示例#2
0
 /**
  * Searches Google for Twitter profiles that match an entity's search name.
  *
  * A result is kept when its title has the form "<name> (<anything>) on Twitter"
  * and its URL ends in a twitter.com path segment (optionally preceded by "#/").
  *
  * @param object $entity entity providing nameSearch()
  *
  * @return array list of array('name' => profile name, 'id' => Twitter path segment)
  */
 public function findKeys($entity)
 {
     $goog = new LsGoogle();
     $goog->setQuery($entity->nameSearch() . " site:twitter.com");
     $results = $goog->parseSearchResults($goog->execute());
     $matches = array();
     if (!$results) {
         return $matches;
     }
     foreach ($results as $r) {
         if (!preg_match('/(.+?)\\(.*?\\)\\s+on\\s+Twitter$/is', $r['titleNoFormatting'], $m_name)) {
             continue;
         }
         // The dot is escaped so that only the literal host "twitter.com" matches.
         if (!preg_match('/twitter\\.com\\/(\\#\\/)?([^\\/]+)$/is', $r['unescapedUrl'], $m)) {
             continue;
         }
         $matches[] = array('name' => trim($m_name[1]), 'id' => $m[2]);
     }
     return $matches;
 }
 /**
  * Tries to attach a photograph of a person found through Google image search.
  *
  * The search term is the quoted full name. When the person has no middle
  * name, the name of one related organization (position category) is
  * appended if such an organization exists. The first jpg/jpeg/gif result
  * whose file name or description contains the person's last name is attached.
  *
  * @param Entity $person the person to find a photo for
  *
  * @return mixed true when an image already exists, false when nothing
  *               suitable is found, otherwise the result of attachImage()
  */
 protected function getPhotoFromGoogleImages(Entity $person)
 {
     if ($this->imageExists($person)) {
         return true;
     }
     // Only look up an employer when there is no middle name to narrow the search.
     $employer = null;
     if (!$person->name_middle) {
         $employer = $person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY)->fetchOne();
     }
     $query = '"' . $person->name . '"';
     if ($employer) {
         $query .= ' ' . $employer->name;
     }
     $searcher = new LsGoogle();
     $searcher->setService('images');
     $searcher->setParameter('imgtype', 'face');
     $searcher->setQuery($query);
     $searcher->execute();
     $results = $searcher->getResults();
     if ($searcher->getNumResults() == 0) {
         return false;
     }
     foreach ($results as $result) {
         $image_url = $result->url;
         $image_content = $result->contentNoFormatting;
         if (!preg_match('/(jpg|jpeg|gif)$/i', $image_url)) {
             continue;
         }
         $file_name = basename($image_url);
         $this->printDebug("Checking " . $file_name);
         // Accept the image only if the last name shows up in its file name or description.
         if (!stristr($file_name . " " . $image_content, $person->getNameLast())) {
             continue;
         }
         $this->printDebug("Imported " . $image_url);
         return $this->attachImage($person, $image_url, 'Photograph');
     }
     $this->printDebug("Photo not found");
     return false;
 }
 /**
  * Searches an organization's website for bios and images of its people.
  *
  * For every related person (position category) whose summary is NULL or
  * empty, Google is queried with "site:<org website> <person name>". Each
  * non-PDF result page is fetched and passed to findPersonBio() and
  * findPersonImage(). A found bio is stored as the person's summary; a found
  * image is downloaded through ImageTable::createFiles() and saved as an
  * Image record unless one with the same entity, title and caption exists.
  * Database writes are committed only when testMode is off; otherwise they
  * are rolled back.
  *
  * @param object $org entity providing id, name, website and getRelatedEntitiesQuery()
  *
  * @return null returns null explicitly when the miss counter exceeds 30 or a
  *              timeout is detected; otherwise falls off the end without a value
  */
 private function getLobbyistInfo($org)
 {
     $people = $org->getRelatedEntitiesQuery('Person', RelationshipTable::POSITION_CATEGORY, null, null, null, false)->addWhere('summary is NULL or summary = ?', '')->execute();
     $google_scraper = new LsGoogle();
     // Counts people for whom neither a bio nor an image was found; it is
     // reset to 0 whenever a bio or an image is saved.
     $ct = 0;
     foreach ($people as $person) {
         // Give up once more than 30 misses have accumulated since the last success.
         if ($ct > 30) {
             return null;
         }
         $this->printDebug("\n******************\n");
         $bio = null;
         $image = null;
         $query = 'site:' . $org->website . ' ' . $person->name;
         $this->printDebug('Query: ' . $query);
         $google_scraper->setQuery(trim($query));
         $google_scraper->execute();
         if ($google_scraper->getNumResults()) {
             $results = $google_scraper->getResults();
             // NOTE(review): $match_sets is never filled anywhere in this method.
             $match_sets = array();
             // Take a baseline reading of the timer before walking the results.
             // NOTE(review): presumably stopTimer()/beginTimer() pause and resume
             // a cumulative timer so getElapsedTime() can be read — confirm.
             $this->stopTimer();
             $last = $this->timer->getElapsedTime();
             $this->beginTimer();
             foreach ($results as $result) {
                 // Measure how much time passed since the previous reading.
                 $this->stopTimer();
                 $now = $this->timer->getElapsedTime();
                 $diff = $now - $last;
                 $this->printDebug($diff);
                 $last = $now;
                 $this->beginTimer();
                 // A gap above 30 is treated as a timeout: flag the org via
                 // saveMeta() and abort the whole method.
                 // NOTE(review): the unit of the elapsed time is not visible here — confirm.
                 if ($diff > 30) {
                     try {
                         $this->db->beginTransaction();
                         $this->saveMeta($org->id, 'timeout', 1);
                         $this->printDebug('TIMEOUT=======================================');
                         if (!$this->testMode) {
                             $this->db->commit();
                         } else {
                             $this->db->rollback();
                         }
                     } catch (Exception $e) {
                         // Failure to record the timeout flag is ignored; the method still aborts.
                         $this->db->rollback();
                     }
                     return null;
                 }
                 // The cacheUrl branch is disabled by the constant condition;
                 // the unescaped result URL is always used.
                 if (0) {
                     $url = $result->cacheUrl;
                 } else {
                     $url = $result->unescapedUrl;
                 }
                 $this->printDebug($url);
                 if (preg_match('/\\.pdf$/is', $url)) {
                     $this->printDebug("PDF, skipping ({$url})\n----------------");
                     continue;
                 }
                 // A page that cannot be fetched at all is skipped silently.
                 try {
                     $error = $this->browser->get($url)->responseIsError();
                 } catch (Exception $e) {
                     continue;
                 }
                 if (!$error) {
                     $this->printDebug('checking: ' . $url);
                     $page = $this->browser->getResponseText();
                     $page = LsHtml::replaceEntities($page);
                     // Look for a bio only until the first one has been found for this person.
                     if (!$bio) {
                         if ($bio = $this->findPersonBio($page, $person, $org)) {
                             try {
                                 $this->db->beginTransaction();
                                 $person->summary = $bio;
                                 $person->save();
                                 $person->addReference($url, null, array('summary'), $org->name . ' website');
                                 $this->printDebug("\nBIO FOUND & SAVED: " . $bio . "\n");
                                 $ct = 0;
                                 if (!$this->testMode) {
                                     $this->db->commit();
                                 } else {
                                     $this->db->rollback();
                                 }
                             } catch (Exception $e) {
                                 $this->db->rollback();
                                 throw $e;
                             }
                         } else {
                             $this->printDebug('no bio');
                         }
                     }
                     // Look for an image only until the first one has been found for this person.
                     if (!$image) {
                         if ($image = $this->findPersonImage($page, $person, $org)) {
                             $this->printDebug('IMAGE FOUND: ' . $image['url']);
                             // Extract "http://host" from the page URL for root-relative image paths.
                             // NOTE(review): this pattern only matches http://; for an https page
                             // URL $match[1] would be undefined — confirm and handle.
                             preg_match('/(http\\:\\/\\/[^\\/]+)\\//is', $url, $match);
                             $root_url = $match[1];
                             $image_url = null;
                             if (preg_match('/^http/is', $image['url'])) {
                                 // Already absolute.
                                 $image_url = $image['url'];
                             } else {
                                 // Directory of the page: cut at the last slash unless that
                                 // slash is within the first 9 characters of the URL.
                                 $pos = strrpos($url, '/');
                                 if ($pos > 8) {
                                     $trimmed_url = substr($url, 0, $pos);
                                 } else {
                                     $trimmed_url = $url;
                                 }
                                 if (preg_match('/^\\//is', $image['url'])) {
                                     // Root-relative path.
                                     $image_url = $root_url . $image['url'];
                                 } else {
                                     if (preg_match('/^((\\.\\.\\/)+)(.+)/is', $image['url'], $match)) {
                                         // Each leading "../" (3 characters) removes one
                                         // trailing path segment from the page directory.
                                         $num_steps = strlen($match[1]) / 3;
                                         for ($i = 0; $i < $num_steps; $i++) {
                                             $trimmed_url = substr($trimmed_url, 0, strrpos($trimmed_url, '/'));
                                         }
                                         $image_url = $trimmed_url . '/' . $match[3];
                                     } else {
                                         // Plain relative path.
                                         $image_url = $trimmed_url . '/' . $image['url'];
                                     }
                                 }
                             }
                             if ($image_url) {
                                 $this->printDebug($image_url);
                                 if ($fileName = ImageTable::createFiles($image_url, $person->name)) {
                                     //insert image record
                                     try {
                                         $this->db->beginTransaction();
                                         // NOTE(review): $image is reassigned here from the array
                                         // returned by findPersonImage() to an Image record.
                                         $image = new Image();
                                         $image->filename = $fileName;
                                         $image->entity_id = $person->id;
                                         $image->title = $person->name;
                                         $image->caption = 'From ' . $org->name . '\'s website.';
                                         $image->is_featured = true;
                                         $image->is_free = false;
                                         $image->url = $image_url;
                                         // Save only if no image with the same entity, title and caption exists.
                                         $q = LsDoctrineQuery::create()->from('Image i')->where('i.entity_id = ?', $person->id)->addWhere('i.title =?', $person->name)->addWhere('i.caption =?', $image->caption);
                                         if (count($q->execute()) == 0) {
                                             $image->save();
                                             $image->addReference($image_url, null, array('filename'), $org->name . ' website');
                                             // Without a bio the page has not yet been referenced on the person.
                                             if (!$bio) {
                                                 $person->addReference($url, null, null, $org->name . ' website');
                                             }
                                             $this->printDebug("Imported image: " . $image->filename);
                                             $ct = 0;
                                         }
                                         if (!$this->testMode) {
                                             $this->db->commit();
                                         } else {
                                             $this->db->rollback();
                                         }
                                     } catch (Exception $e) {
                                         $this->db->rollback();
                                         throw $e;
                                     }
                                 }
                             }
                         } else {
                             $this->printDebug('no image');
                         }
                     }
                     // Nothing more to find for this person.
                     if ($bio && $image) {
                         break;
                     }
                 } else {
                     $this->printDebug('response is error: ' . $url);
                 }
                 $this->printDebug('-------------');
             }
             if (count($match_sets)) {
                 //var_dump($match_sets);
             }
         } else {
             $this->printDebug("No results found \n");
         }
         // Count this person as a miss when neither a bio nor an image turned up.
         if (!$image && !$bio) {
             $ct++;
         }
     }
 }
示例#5
0
 /**
  * Tries to attach an organization logo found through Google image search.
  *
  * The search term is the org name with business words and abbreviations
  * removed, followed by "logo". A png/gif/jpg result is accepted when at
  * least two tokens of the lower-cased org name (plus the extra tokens
  * "logo" and "seal") also occur in the image's file name or description.
  *
  * Tokenising uses preg_split() because split() was removed in PHP 7.0.
  *
  * @param Entity $org the organization to find a logo for
  *
  * @return bool true when an image already exists or one was attached, false otherwise
  */
 protected function getLogoFromGoogleImage(Entity $org)
 {
     if ($this->imageExists($org)) {
         return true;
     }
     //construct search query
     $nameParts = array_diff(explode(' ', $org->name), array_merge(LsLanguage::$business, LsLanguage::$businessAbbreviations));
     $cleanName = trim(implode(' ', $nameParts));
     $query = $cleanName . ' logo';
     $this->printDebug("Querying Google with term: " . $query);
     $google = new LsGoogle();
     $google->setService('images');
     $google->setQuery($query);
     $google->execute();
     $results = $google->getResults();
     // Tokens are separated by space, dot, underscore or hyphen.
     $separators = '/[ ._-]/';
     // The org-name tokens are the same for every result, so build them once.
     // Empty tokens are dropped so they can never count as a match.
     $organization_name_parts = preg_split($separators, strtolower($org->name), -1, PREG_SPLIT_NO_EMPTY);
     $organization_name_parts[] = "logo";
     $organization_name_parts[] = "seal";
     foreach ($results as $result) {
         $image_url = $result->url;
         $image_content = $result->contentNoFormatting;
         $this->printDebug("Checking: " . $image_url);
         if (!preg_match('/(png|gif|jpg)$/i', $image_url)) {
             continue;
         }
         // Compare against the decoded file name plus description, digits removed.
         $haystack = preg_replace("/[0-9]/", "", strtolower(basename(urldecode(basename($image_url))) . " " . urldecode($image_content)));
         $organization_match_parts = LsArray::arrayTrim(preg_split($separators, $haystack));
         $intersect = array_intersect($organization_name_parts, $organization_match_parts);
         if (count($intersect) >= 2) {
             //Entity $entity, $url, $title = 'title', $caption='caption', $is_featured = 1, $is_free = 0
             if ($this->attachImage($org, $image_url, 'Organization logo')) {
                 $this->printDebug("Saved");
                 return true;
             }
         }
     }
     $this->printDebug("Logo not found on Google");
     return false;
 }
 /**
  * Imports education, affiliations and background for a person from Businessweek.
  *
  * Steps:
  *  1. Google "site:investing.businessweek.com <name>" and take the first
  *     result URL that looks like a person.asp?personId=N profile.
  *  2. If none is found, fetch each result page and look for a link whose
  *     text matches the person's name regex and whose URL is such a profile.
  *  3. Fetch the profile and parse the EDUCATION, OTHER AFFILIATIONS and
  *     BACKGROUND sections. Processing stops when no education entries are found.
  *  4. For each education entry, cite Wikipedia as the source when
  *     LsLanguage::getCommonPronouns() finds the institution in the person's
  *     Wikipedia plain text; otherwise cite the Businessweek profile.
  *  5. Hand the assembled record to import().
  *
  * Missing affiliations or background sections yield an empty employment
  * list and an empty summary respectively.
  *
  * @param Entity $person the person to look up
  *
  * @return null always null; results are passed to import()
  */
 function getBusinessWeek(Entity $person)
 {
     $site_root = 'http://investing.businessweek.com/';
     $profile_pattern = '/^.*?person\\.asp\\?personId=\\d+/is';
     // Matches URLs that already carry a scheme, http or https.
     $absolute_pattern = '#^https?://#i';

     $search = 'site:investing.businessweek.com ' . $person->name;
     $google_scraper = new LsGoogle();
     $google_scraper->setQuery($search);
     $this->printDebug($search);
     $google_scraper->execute();
     if (!$google_scraper->getNumResults()) {
         return null;
     }
     $results = $google_scraper->getResults();

     // First attempt: a search result that is itself a profile URL.
     $businessweek_profile = null;
     foreach ($results as $result) {
         $this->printDebug($result->unescapedUrl);
         if (preg_match($profile_pattern, $result->unescapedUrl, $match)) {
             $businessweek_profile = $match[0];
             break;
         }
     }

     // Second attempt: follow each result and look for a profile link on the page.
     if (!$businessweek_profile) {
         foreach ($results as $result) {
             $url = $result->unescapedUrl;
             // Drop everything from the first "&" onwards.
             if (preg_match('/^(.*?)\\&/is', $url, $match)) {
                 $url = $match[1];
             }
             if (!preg_match($absolute_pattern, $url)) {
                 $url = $site_root . $url;
             }
             $this->printDebug('new url: ' . $url);
             if ($this->browser->get($url)->responseIsError()) {
                 continue;
             }
             $links = LsHtml::matchLinks($this->browser->getResponseText());
             foreach ($links as $link) {
                 if (preg_match('/' . $person->getNameRegex(true) . '/s', $link['text']) && preg_match($profile_pattern, $link['url'], $match)) {
                     $businessweek_profile = $match[0];
                     if (!preg_match($absolute_pattern, $businessweek_profile)) {
                         $businessweek_profile = $site_root . $businessweek_profile;
                     }
                     break;
                 }
             }
             if ($businessweek_profile) {
                 $this->printDebug('Businessweek profile found on 2nd attempt: ' . $businessweek_profile);
                 break;
             }
         }
         if (!$businessweek_profile) {
             $this->printDebug('Buisnessweek profile not found');
             return;
         }
     }

     //go to businessweek profile and get education
     $this->browser->get($businessweek_profile);
     $text = $this->browser->getResponseText();
     if (!$text) {
         $this->printDebug('Businessweek browser error');
         return;
     }

     // Education entries: group 1 = degree, group 2 = optional year, group 3 = institution.
     $education_found = array();
     $ed_matched = false;
     if (preg_match('#EDUCATION[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<h2#is', $text, $education)) {
         $ed_matched = preg_match_all('/<strong>(.+?)<\\/strong>\\s*(\\d{4})?\\s*<\\/div><div.*?>(.+?)</s', $education[1], $education_found);
     }
     if (!$ed_matched) {
         $this->printDebug('Education info not found at Businessweek');
         return;
     }
     $this->printDebug('Education info found at Businessweek');

     // Affiliations are optional; stay with an empty list when the section is absent.
     $companies = array();
     if (preg_match('#OTHER AFFILIATIONS[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/td#s', $text, $employment)) {
         if (preg_match_all('#href\\=\\".+?\\"\\>(.+?)\\<\\/a\\>#is', $employment[1], $employment_found)) {
             $companies = $employment_found[1];
         }
     }

     // Background is optional too; the summary is an empty string when it is absent.
     $summary = '';
     if (preg_match('#BACKGROUND[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/p>#s', $text, $summary_match)) {
         $summary = strip_tags($summary_match[1]);
     }

     $wikipedia = new LsWikipedia();
     $wikipedia->request($person->name);
     $wikipedia->execute();
     $plaintext = $wikipedia->getPlainText();
     $ignored_words = array_merge(LsLanguage::$business, LsLanguage::$schools, LsLanguage::$grammar);

     $education_history = array();
     foreach ($education_found[3] as $key => $institution) {
         $year = $education_found[2][$key];
         if (LsLanguage::getCommonPronouns($institution, $plaintext, $ignored_words)) {
             $source = 'http://en.wikipedia.org/wiki/' . str_replace('+', '_', $wikipedia->getTitle());
         } else {
             $source = $businessweek_profile;
         }
         $education_history[] = (object) array(
             'institution' => $institution,
             'degree' => $education_found[1][$key],
             'year' => $year != '' ? $year : null,
             'source' => $source,
         );
     }

     $employment_history = array();
     foreach ($companies as $company) {
         $employment_history[] = (object) array('company' => $company, 'title' => null);
     }

     $possible_person = array('name' => $person->name, 'summary' => $summary, 'employment_history' => (object) $employment_history, 'education' => (object) $education_history);
     $possible_persons = array((object) $possible_person);
     $this->import($person, $possible_persons);
 }
 /**
  * Marks board relationships as current or not, based on a board web page.
  *
  * Google is searched for "<entity name> board" and the first result is
  * fetched, or the second one when the first URL contains "yahoo.com". Every
  * relationship's Entity1 name regexes are tested against the page text.
  * When more than one distinct entity1_id is found on the page, is_current
  * is set to 1 for relationships whose person was found and 0 for the
  * others. They are saved and referenced unless testMode is on.
  *
  * @param mixed $board_rels iterable of relationship records providing Entity1 and entity1_id
  *
  * @return int|null 1 when relationships were updated;
  *                  0 when too few names were found or an exception occurred;
  *                  -1 when the search yields no usable result;
  *                  null when the page response is an error
  */
 public function checkBoardPage($board_rels)
 {
     $goog = new LsGoogle();
     $goog->setQuery($this->entity->name . " board");
     $results = $goog->parseSearchResults($goog->execute());
     if (!$results) {
         return -1;
     }
     $url = $results[0]['unescapedUrl'];
     // Strict comparison: stripos() returns 0 for a match at the very start.
     if (stripos($url, "yahoo.com") !== false) {
         // Fall back to the second result, which may not exist.
         if (!isset($results[1]['unescapedUrl'])) {
             return -1;
         }
         $url = $results[1]['unescapedUrl'];
     }
     $this->printDebug($url);
     try {
         if ($this->browser->get($url)->responseIsError()) {
             // Preserved from the original, which fell through without a return value here.
             return null;
         }
         $text = LsHtml::replaceEntities($this->browser->getResponseText());
         // Pairs of (relationship, found flag), kept together so the update
         // step does not depend on how $board_rels is keyed.
         $checked = array();
         $found_ids = array();
         foreach ($board_rels as $br) {
             $found = 0;
             foreach ($br->Entity1->getNameRegexes() as $regex) {
                 // Only the yes/no answer matters, so a single match suffices.
                 if (preg_match($regex, $text)) {
                     $found = 1;
                     $found_ids[$br->entity1_id] = true;
                     break;
                 }
             }
             $checked[] = array($br, $found);
             $this->printDebug($br->Entity1->name . " > " . $found);
         }
         // At least two distinct people must appear on the page before it is trusted.
         if (count($found_ids) <= 1) {
             return 0;
         }
         $this->printDebug("\tenough board member names found to mark as current or not");
         foreach ($checked as $pair) {
             list($br, $found) = $pair;
             $br->is_current = $found;
             if (!$this->testMode) {
                 $br->save();
                 $br->addReference($url, null, null, $this->entity->name . ' board', null, null);
             }
         }
         return 1;
     } catch (Exception $e) {
         return 0;
     }
 }