/**
 * Looks for an organization's website via Google and stores it on the org.
 *
 * Every search result for the org's name is reduced to its "http://host/" root.
 * A root accepted by checkUrl() is then cross-checked with site-restricted
 * searches for the org's related people. As soon as one root yields at least
 * one person, it is written to $org->website (only when that field is empty),
 * the org is saved and the search stops.
 *
 * @param object $org record exposing name, website, getRelatedEntitiesQuery() and save()
 *                    (presumably an Org entity -- confirm against callers)
 *
 * @return void
 */
private function getWebsite($org)
{
    $this->printDebug($org->name);

    $google_scraper = new LsGoogle();
    $google_scraper->setQuery(trim($org->name));
    $google_scraper->execute();
    $results = $google_scraper->getResults();

    foreach ($results as $result) {
        // Keep only scheme + host; results without an http:// root are skipped.
        if (!preg_match('/http\\:\\/\\/[^\\/]+\\//isu', $result->unescapedUrl, $match)) {
            continue;
        }
        $trimmed_url = $match[0];

        if (!$this->checkUrl($trimmed_url, $org->name)) {
            $this->printDebug('failed: ' . $result->url);
            continue;
        }
        $this->printDebug('passed: ' . $result->url);

        // Count how many related people are mentioned on the candidate site.
        // Two hits are enough, so the loop stops early to spare extra queries.
        $people = $org->getRelatedEntitiesQuery('Person')->execute();
        $people_ct = 0;
        foreach ($people as $p) {
            $q = 'site:' . $trimmed_url . ' "' . $p->name_first . ' ' . $p->name_last . '"';
            $this->printDebug($q);
            $google_scraper->setQuery($q);
            $google_scraper->execute();
            if ($google_scraper->getNumResults() > 0) {
                $people_ct++;
            }
            if ($people_ct > 1) {
                break;
            }
        }

        if ($people_ct == 0) {
            $this->printDebug('no people found');
            $this->printDebug('');
            continue;
        }

        // Never overwrite a website that is already recorded.
        if (!$org->website) {
            $org->website = $trimmed_url;
            $this->printDebug('website saved for ' . $org->name . ': ' . $trimmed_url);
        }
        $org->save();
        break;
    }

    $this->printDebug('***');
}
/**
 * Searches Google for Twitter profiles that may belong to an entity.
 *
 * A result is kept only when its title has the form "<name> (<...>) on Twitter"
 * and its URL ends in "twitter.com/<id>" (optionally "twitter.com/#/<id>").
 *
 * @param object $entity record exposing nameSearch()
 *
 * @return array list of array('name' => display name from the title, 'id' => last URL segment)
 */
public function findKeys($entity)
{
    $goog = new LsGoogle();
    $goog->setQuery($entity->nameSearch() . " site:twitter.com");
    $results = $goog->parseSearchResults($goog->execute());

    $matches = array();
    if (!$results) {
        return $matches;
    }

    foreach ($results as $r) {
        if (!preg_match('/(.+?)\\(.*?\\)\\s+on\\s+Twitter$/is', $r['titleNoFormatting'], $m_name)) {
            continue;
        }
        // The dot is escaped so only the literal host "twitter.com" matches.
        if (!preg_match('/twitter\\.com\\/(\\#\\/)?([^\\/]+)$/is', $r['unescapedUrl'], $m)) {
            continue;
        }
        $matches[] = array('name' => trim($m_name[1]), 'id' => $m[2]);
    }

    return $matches;
}
/**
 * Attempts to attach a photograph of a person found through a Google image search.
 *
 * The query is the quoted full name; when the person has no middle name and a
 * related Org (position category) exists, that org's name is appended. The first
 * jpg/jpeg/gif result whose file name or caption contains the person's last name
 * is handed to attachImage().
 *
 * @param Entity $person person to find a photo for
 *
 * @return mixed true when an image already exists, false when nothing suitable
 *               was found, otherwise whatever attachImage() returns
 */
protected function getPhotoFromGoogleImages(Entity $person)
{
    if ($this->imageExists($person)) {
        return true;
    }

    // Build the search query.
    $query = '"' . $person->name . '"';
    if (!$person->name_middle) {
        $employer = $person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY)->fetchOne();
        if ($employer) {
            $query = '"' . $person->name . '" ' . $employer->name;
        }
    }

    $google = new LsGoogle();
    $google->setService('images');
    $google->setParameter('imgtype', 'face');
    $google->setQuery($query);
    $google->execute();
    $results = $google->getResults();

    if ($google->getNumResults() == 0) {
        return false;
    }

    foreach ($results as $candidate) {
        $image_url = $candidate->url;
        $caption = $candidate->contentNoFormatting;

        if (!preg_match('/(jpg|jpeg|gif)$/i', $image_url)) {
            continue;
        }

        $file_name = basename($image_url);
        $this->printDebug("Checking " . $file_name);

        if (!stristr($file_name . " " . $caption, $person->getNameLast())) {
            continue;
        }

        $this->printDebug("Imported " . $image_url);

        return $this->attachImage($person, $image_url, 'Photograph');
    }

    $this->printDebug("Photo not found");

    return false;
}
/**
 * Scrapes an organization's website for bios and images of its related people.
 *
 * For each related person without a summary, Google is searched with
 * "site:<org website> <person name>". Each non-PDF result page is fetched and
 * passed to findPersonBio() / findPersonImage(). A bio is stored as the person's
 * summary; an image is downloaded and stored as a featured Image record unless
 * one with the same entity, title and caption already exists. In test mode every
 * transaction is rolled back instead of committed.
 *
 * The method gives up (returns null) when
 *  - more than 30 people in a row produced neither a bio nor an image, or
 *  - the timer advanced by more than 30 between two results, in which case a
 *    'timeout' meta flag is saved for the org.
 *
 * @param object $org record exposing id, name, website and getRelatedEntitiesQuery()
 *
 * @return null
 *
 * @throws Exception when saving a bio or an image fails (after rolling back)
 */
private function getLobbyistInfo($org)
{
    $people = $org->getRelatedEntitiesQuery('Person', RelationshipTable::POSITION_CATEGORY, null, null, null, false)
        ->addWhere('summary is NULL or summary = ?', '')
        ->execute();

    $google_scraper = new LsGoogle();

    // Number of consecutive people for whom nothing was found; reset on every save.
    $ct = 0;

    foreach ($people as $person) {
        if ($ct > 30) {
            return null;
        }

        $this->printDebug("\n******************\n");
        $bio = null;
        $image = null;

        $query = 'site:' . $org->website . ' ' . $person->name;
        $this->printDebug('Query: ' . $query);
        $google_scraper->setQuery(trim($query));
        $google_scraper->execute();

        if (!$google_scraper->getNumResults()) {
            $this->printDebug("No results found \n");
            $ct++;
            continue;
        }

        $results = $google_scraper->getResults();

        $this->stopTimer();
        $last = $this->timer->getElapsedTime();
        $this->beginTimer();

        foreach ($results as $result) {
            // Measure how much the timer advanced while handling the previous result.
            $this->stopTimer();
            $now = $this->timer->getElapsedTime();
            $diff = $now - $last;
            $this->printDebug($diff);
            $last = $now;
            $this->beginTimer();

            if ($diff > 30) {
                // Best effort: flag the org as timed out, then abandon the whole run.
                try {
                    $this->db->beginTransaction();
                    $this->saveMeta($org->id, 'timeout', 1);
                    $this->printDebug('TIMEOUT=======================================');
                    if (!$this->testMode) {
                        $this->db->commit();
                    } else {
                        $this->db->rollback();
                    }
                } catch (Exception $e) {
                    $this->db->rollback();
                }

                return null;
            }

            $url = $result->unescapedUrl;
            $this->printDebug($url);

            if (preg_match('/\\.pdf$/is', $url)) {
                $this->printDebug("PDF, skipping ({$url})\n----------------");
                continue;
            }

            try {
                $error = $this->browser->get($url)->responseIsError();
            } catch (Exception $e) {
                continue;
            }

            if ($error) {
                $this->printDebug('response is error: ' . $url);
                $this->printDebug('-------------');
                continue;
            }

            $this->printDebug('checking: ' . $url);
            $page = LsHtml::replaceEntities($this->browser->getResponseText());

            if (!$bio) {
                if ($bio = $this->findPersonBio($page, $person, $org)) {
                    try {
                        $this->db->beginTransaction();
                        $person->summary = $bio;
                        $person->save();
                        $person->addReference($url, null, array('summary'), $org->name . ' website');
                        $this->printDebug("\nBIO FOUND & SAVED: " . $bio . "\n");
                        $ct = 0;
                        if (!$this->testMode) {
                            $this->db->commit();
                        } else {
                            $this->db->rollback();
                        }
                    } catch (Exception $e) {
                        $this->db->rollback();
                        throw $e;
                    }
                } else {
                    $this->printDebug('no bio');
                }
            }

            if (!$image) {
                // Once an image candidate is found, $image stays set for this person
                // even if downloading or saving it does not succeed.
                if ($image = $this->findPersonImage($page, $person, $org)) {
                    $this->printDebug('IMAGE FOUND: ' . $image['url']);
                    $image_url = $this->resolveScrapedImageUrl($url, $image['url']);

                    if ($image_url) {
                        $this->printDebug($image_url);

                        if ($fileName = ImageTable::createFiles($image_url, $person->name)) {
                            //insert image record
                            try {
                                $this->db->beginTransaction();

                                $image_record = new Image();
                                $image_record->filename = $fileName;
                                $image_record->entity_id = $person->id;
                                $image_record->title = $person->name;
                                $image_record->caption = 'From ' . $org->name . '\'s website.';
                                $image_record->is_featured = true;
                                $image_record->is_free = false;
                                $image_record->url = $image_url;

                                // Skip the insert when an equivalent image is already stored.
                                $q = LsDoctrineQuery::create()
                                    ->from('Image i')
                                    ->where('i.entity_id = ?', $person->id)
                                    ->addWhere('i.title =?', $person->name)
                                    ->addWhere('i.caption =?', $image_record->caption);

                                if (count($q->execute()) == 0) {
                                    $image_record->save();
                                    $image_record->addReference($image_url, null, array('filename'), $org->name . ' website');
                                    if (!$bio) {
                                        $person->addReference($url, null, null, $org->name . ' website');
                                    }
                                    $this->printDebug("Imported image: " . $image_record->filename);
                                    $ct = 0;
                                }

                                if (!$this->testMode) {
                                    $this->db->commit();
                                } else {
                                    $this->db->rollback();
                                }
                            } catch (Exception $e) {
                                $this->db->rollback();
                                throw $e;
                            }
                        }
                    }
                } else {
                    $this->printDebug('no image');
                }
            }

            if ($bio && $image) {
                break;
            }

            $this->printDebug('-------------');
        }

        if (!$image && !$bio) {
            $ct++;
        }
    }
}

/**
 * Turns an image location found in a page into an absolute URL.
 *
 * @param string $page_url  URL of the page the image reference was found on
 * @param string $image_src image location as written in the page: absolute,
 *                          root-relative ("/a/b.jpg") or relative ("b.jpg", "../b.jpg")
 *
 * @return string|null absolute image URL, or null when a root-relative path
 *                     cannot be anchored because $page_url has no http(s) host
 */
private function resolveScrapedImageUrl($page_url, $image_src)
{
    if (preg_match('/^http/is', $image_src)) {
        return $image_src;
    }

    if (preg_match('/^\\//is', $image_src)) {
        // Anchor to scheme + host; works for http and https, with or without a path.
        if (!preg_match('/^(https?\\:\\/\\/[^\\/]+)/is', $page_url, $root)) {
            return null;
        }

        return $root[1] . $image_src;
    }

    // Directory of the page: drop the last path segment unless the final slash
    // still belongs to the scheme part.
    $pos = strrpos($page_url, '/');
    $base = $pos > 8 ? substr($page_url, 0, $pos) : $page_url;

    if (preg_match('/^((\\.\\.\\/)+)(.+)/is', $image_src, $up)) {
        // Each "../" (3 characters) climbs one directory.
        $steps = strlen($up[1]) / 3;
        for ($i = 0; $i < $steps; $i++) {
            $base = substr($base, 0, strrpos($base, '/'));
        }

        return $base . '/' . $up[3];
    }

    return $base . '/' . $image_src;
}
/**
 * Attempts to attach an organization logo found through a Google image search.
 *
 * The query is the org name with business words/abbreviations removed, followed
 * by "logo". A png/gif/jpg result is accepted when at least two tokens of the
 * lower-cased org name (plus "logo" and "seal") also occur among the tokens of
 * the image's file name and caption, and attachImage() succeeds.
 *
 * @param Entity $org organization to find a logo for
 *
 * @return bool true when an image already exists or one was attached, false otherwise
 */
protected function getLogoFromGoogleImage(Entity $org)
{
    if ($this->imageExists($org)) {
        return true;
    }

    //construct search query
    $nameParts = array_diff(
        explode(' ', $org->name),
        array_merge(LsLanguage::$business, LsLanguage::$businessAbbreviations)
    );
    $cleanName = trim(implode(' ', $nameParts));
    $query = $cleanName . ' logo';
    $this->printDebug("Querying Google with term: " . $query);

    $google = new LsGoogle();
    $google->setService('images');
    $google->setQuery($query);
    $google->execute();
    $results = $google->getResults();

    // split() no longer exists as of PHP 7, so tokenising uses preg_split().
    // The class keeps the separators the old POSIX bracket expression matched:
    // space, dot, underscore, hyphen and a literal backslash (backslash is not
    // an escape character inside a POSIX bracket expression).
    $separators = '/[ \\\\._\\-]/';

    // Tokens to look for; identical for every result, so computed once.
    $organization_name_parts = preg_split($separators, strtolower($org->name));
    $organization_name_parts[] = "logo";
    $organization_name_parts[] = "seal";

    foreach ($results as $result) {
        $image_url = $result->url;
        $image_content = $result->contentNoFormatting;
        $this->printDebug("Checking: " . $image_url);

        if (!preg_match('/(png|gif|jpg)$/i', $image_url)) {
            continue;
        }
        $this->printDebug("Checking " . $image_url);

        // Tokens of the digit-stripped, lower-cased file name and caption.
        $basefilename = basename($image_url);
        $description = strtolower(basename(urldecode($basefilename)) . " " . urldecode($image_content));
        $organization_match_parts = LsArray::arrayTrim(
            preg_split($separators, preg_replace("/[0-9]/", "", $description))
        );

        $intersect = array_intersect($organization_name_parts, $organization_match_parts);

        if (count($intersect) >= 2 && $this->attachImage($org, $image_url, 'Organization logo')) {
            $this->printDebug("Saved");

            return true;
        }
    }

    $this->printDebug("Logo not found on Google");

    return false;
}
/**
 * Imports education, affiliation and background data from a person's
 * Businessweek profile.
 *
 * The profile URL (person.asp?personId=N) is taken directly from the Google
 * results for "site:investing.businessweek.com <name>" when possible; otherwise
 * each result page is fetched and searched for a link to such a profile whose
 * text matches the person's name regex. The EDUCATION, OTHER AFFILIATIONS and
 * BACKGROUND sections are then parsed from the profile. Nothing is imported
 * unless at least one education entry was found. Each education entry is
 * sourced to Wikipedia when LsLanguage::getCommonPronouns() reports a match
 * between the institution and the person's Wikipedia text, otherwise to the
 * Businessweek profile.
 *
 * @param Entity $person person to enrich
 *
 * @return null always; results are handed to import()
 */
public function getBusinessWeek(Entity $person)
{
    $search = 'site:investing.businessweek.com ' . $person->name;
    $profile_pattern = '/^.*?person\\.asp\\?personId=\\d+/is';

    $google_scraper = new LsGoogle();
    $google_scraper->setQuery($search);
    $this->printDebug($search);
    $google_scraper->execute();

    if (!$google_scraper->getNumResults()) {
        return null;
    }
    $results = $google_scraper->getResults();

    // First attempt: a search result that is itself a profile URL.
    $businessweek_profile = null;
    foreach ($results as $result) {
        $this->printDebug($result->unescapedUrl);
        if (preg_match($profile_pattern, $result->unescapedUrl, $match)) {
            $businessweek_profile = $match[0];
            break;
        }
    }

    // Second attempt: follow each result and look for a profile link on the page.
    if (!$businessweek_profile) {
        foreach ($results as $result) {
            $url = $result->unescapedUrl;
            // Cut the URL at the first '&'.
            if (preg_match('/^(.*?)\\&/is', $url, $match)) {
                $url = $match[1];
            }
            if (!stristr($url, 'http://')) {
                $url = 'http://investing.businessweek.com/' . $url;
            }
            $this->printDebug('new url: ' . $url);

            if ($this->browser->get($url)->responseIsError()) {
                continue;
            }

            $links = LsHtml::matchLinks($this->browser->getResponseText());
            foreach ($links as $link) {
                if (preg_match('/' . $person->getNameRegex(true) . '/s', $link['text'])
                    && preg_match($profile_pattern, $link['url'], $match)
                ) {
                    $url = $match[0];
                    if (!stristr($url, 'http://')) {
                        $url = 'http://investing.businessweek.com/' . $url;
                    }
                    $businessweek_profile = $url;
                    break;
                }
            }

            if ($businessweek_profile) {
                $this->printDebug('Businessweek profile found on 2nd attempt: ' . $businessweek_profile);
                break;
            }
        }

        if (!$businessweek_profile) {
            $this->printDebug('Buisnessweek profile not found');

            return null;
        }
    }

    //go to businessweek profile and get education
    $this->browser->get($businessweek_profile);
    $text = $this->browser->getResponseText();
    if (!$text) {
        $this->printDebug('Businessweek browser error');

        return null;
    }

    // Education: [1] degree, [2] optional four-digit year, [3] institution.
    $education_found = array();
    $ed_matched = false;
    if (preg_match('#EDUCATION[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<h2#is', $text, $education)) {
        $ed_matched = preg_match_all('/<strong>(.+?)<\\/strong>\\s*(\\d{4})?\\s*<\\/div><div.*?>(.+?)</s', $education[1], $education_found);
    }

    // Affiliations: link texts inside the section; stays empty when the section is absent.
    $companies = array();
    if (preg_match('#OTHER AFFILIATIONS[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/td#s', $text, $employment)
        && preg_match_all('#href\\=\\".+?\\"\\>(.+?)\\<\\/a\\>#is', $employment[1], $employment_found)
    ) {
        $companies = $employment_found[1];
    }

    // Background paragraph; empty string when the section is absent.
    $summary = '';
    if (preg_match('#BACKGROUND[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/p>#s', $text, $background)) {
        $summary = strip_tags($background[1]);
    }

    if (!$ed_matched) {
        $this->printDebug('Education info not found at Businessweek');

        return null;
    }
    $this->printDebug('Education info found at Businessweek');

    $wikipedia = new LsWikipedia();
    $wikipedia->request($person->name);
    $wikipedia->execute();
    $plaintext = $wikipedia->getPlainText();

    $education_history = null;
    foreach ($education_found[3] as $key => $institution) {
        $year = $education_found[2][$key];
        $arr = array(
            'institution' => $institution,
            'degree' => $education_found[1][$key],
            'year' => $year != '' ? $year : null,
        );

        $wikipedia_matches = LsLanguage::getCommonPronouns(
            $arr['institution'],
            $plaintext,
            array_merge(LsLanguage::$business, LsLanguage::$schools, LsLanguage::$grammar)
        );
        if ($wikipedia_matches) {
            $arr['source'] = 'http://en.wikipedia.org/wiki/' . str_replace('+', '_', $wikipedia->getTitle());
        } else {
            $arr['source'] = $businessweek_profile;
        }

        $education_history[] = (object) $arr;
    }

    $employment_history = null;
    foreach ($companies as $company) {
        $employment_history[] = (object) array('company' => $company, 'title' => null);
    }

    $possible_person = array(
        'name' => $person->name,
        'summary' => $summary,
        'employment_history' => (object) $employment_history,
        'education' => (object) $education_history,
    );
    $possible_persons = array((object) $possible_person);

    $this->import($person, $possible_persons);
}
/**
 * Marks board relationships as current or not, based on the board page found
 * by googling "<entity name> board".
 *
 * The first search result is used, or the second when the first is on
 * yahoo.com. Each relationship's Entity1 name regexes are matched against the
 * page text. When more than one distinct person is found, every relationship's
 * is_current flag is set to 1 (found) or 0 (not found) and, outside test mode,
 * saved with a reference to the page.
 *
 * @param array|Traversable $board_rels relationships exposing Entity1, entity1_id,
 *                                      is_current, save() and addReference()
 *
 * @return int  1 when the relationships were updated,
 *              0 when the page could not be used (error response, exception,
 *                or fewer than two people found),
 *             -1 when the search produced no usable result
 */
public function checkBoardPage($board_rels)
{
    $goog = new LsGoogle();
    $goog->setQuery($this->entity->name . " board");
    $results = $goog->parseSearchResults($goog->execute());

    if (empty($results)) {
        return -1;
    }

    $url = $results[0]['unescapedUrl'];
    if (stripos($url, "yahoo.com") !== false) {
        // Skip a yahoo.com top hit; without a second result there is nothing to check.
        if (!isset($results[1])) {
            return -1;
        }
        $url = $results[1]['unescapedUrl'];
    }
    $this->printDebug($url);

    try {
        if ($this->browser->get($url)->responseIsError()) {
            return 0;
        }

        $text = LsHtml::replaceEntities($this->browser->getResponseText());

        // Each entry pairs a relationship with its 0/1 "found on page" status, so
        // the update below does not depend on how $board_rels is indexed.
        $statuses = array();
        $unique_arr = array();

        foreach ($board_rels as $br) {
            $found = 0;
            foreach ($br->Entity1->getNameRegexes() as $regex) {
                if (preg_match($regex, $text)) {
                    $found = 1;
                    if (!in_array($br->entity1_id, $unique_arr)) {
                        $unique_arr[] = $br->entity1_id;
                    }
                    break;
                }
            }
            $statuses[] = array($br, $found);
            $this->printDebug($br->Entity1->name . " > " . $found);
        }

        // A single match is not enough evidence that this is the board page.
        if (count($unique_arr) <= 1) {
            return 0;
        }

        $this->printDebug("\tenough board member names found to mark as current or not");
        foreach ($statuses as $pair) {
            list($br, $found) = $pair;
            $br->is_current = $found;
            if (!$this->testMode) {
                $br->save();
                $br->addReference($url, null, null, $this->entity->name . ' board', null, null);
            }
        }

        return 1;
    } catch (Exception $e) {
        return 0;
    }
}