/**
 * Parses table rows out of an HTML fragment and stores them in $this->_rows.
 *
 * Every <tr height="25" bgcolor="#ffffff"> containing four <td> cells yields
 * one row array with these keys:
 *   - name, url: text and href of the first link found in cell 1
 *                (both null when the cell contains no link)
 *   - state:     trimmed content of cell 2
 *   - years:     every four-digit number found in cell 3
 *                (key is absent when none is found)
 *   - party:     trimmed content of cell 4
 *
 * @param string $text HTML to scan
 */
protected function importRows($text)
{
  $rows = array();
  $row_pattern = '/<tr\\s+height\\="25" bgcolor="#ffffff">\\s*<td.*?>(.*?)<\\/td><td.*?>(.*?)<\\/td><td.*?>(.*?)<\\/td><td.*?>(.*?)<\\/td>\\s*<\\/tr>/su';

  if (preg_match_all($row_pattern, $text, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
      // Drop the full-match entry so indexes 0..3 are the four cells.
      array_shift($match);

      // Normalise by key instead of by reference: a by-reference foreach
      // leaves a dangling reference to the last element behind.
      // NOTE(review): assumes the intent is to turn non-breaking spaces (the
      // &nbsp; entity and the UTF-8 character) into plain spaces before
      // trimming - confirm against real input.
      foreach ($match as $index => $cell) {
        $match[$index] = trim(str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $cell));
      }

      // The first cell is expected to hold a link; tolerate its absence
      // instead of raising undefined-index warnings.
      $links = LsHtml::matchLinks($match[0]);
      $first_link = isset($links[0]) ? $links[0] : null;

      $row = array();
      $row['name'] = isset($first_link['text']) ? $first_link['text'] : null;
      $row['url'] = isset($first_link['url']) ? $first_link['url'] : null;
      $row['state'] = $match[1];

      if (preg_match_all('/\\d\\d\\d\\d/', $match[2], $years)) {
        $row['years'] = $years[0];
      }

      $row['party'] = $match[3];
      $rows[] = $row;
    }
  }

  $this->_rows = $rows;
}
/**
 * Looks for a Businessweek profile of the given person, scrapes education,
 * affiliations and the background summary from it, and hands the result to
 * $this->import().
 *
 * Steps:
 *   1. Run an LsGoogle query restricted to investing.businessweek.com.
 *   2. Use the first result whose URL looks like person.asp?personId=<digits>.
 *   3. Otherwise fetch each result page and look for a link whose text matches
 *      the person's name regex and whose URL is such a profile URL.
 *   4. Fetch the profile and extract the EDUCATION, OTHER AFFILIATIONS and
 *      BACKGROUND sections with regular expressions.
 *   5. For each education entry, cite Wikipedia as the source when
 *      LsLanguage::getCommonPronouns() finds the institution in the person's
 *      Wikipedia plain text; otherwise cite the Businessweek profile.
 *
 * The method returns early, without importing anything, when there are no
 * search results, no profile is found, the profile page yields no text, or no
 * education entries are matched.
 *
 * @param Entity $person person to look up; ->name and ->getNameRegex() are used
 * @return null no value is returned on any path
 */
function getBusinessWeek(Entity $person)
{
  $profile_pattern = '/^.*?person\\.asp\\?personId=\\d+/is';
  $site_prefix = 'http://investing.businessweek.com/';

  $google_scraper = new LsGoogle();
  $google_scraper->setQuery('site:investing.businessweek.com ' . $person->name);
  $this->printDebug('site:investing.businessweek.com ' . $person->name);
  $google_scraper->execute();

  if (!$google_scraper->getNumResults()) {
    return null;
  }

  $results = $google_scraper->getResults();
  $businessweek_profile = null;

  // First attempt: a search result that is itself a profile URL.
  foreach ($results as $result) {
    $this->printDebug($result->unescapedUrl);
    if (preg_match($profile_pattern, $result->unescapedUrl, $match)) {
      $businessweek_profile = $match[0];
      break;
    }
  }

  // Second attempt: follow each result and look for a profile link whose
  // text matches the person's name.
  if (!$businessweek_profile) {
    foreach ($results as $result) {
      $url = $result->unescapedUrl;

      // Keep only the part of the URL before the first '&'.
      if (preg_match('/^(.*?)\\&/is', $url, $match)) {
        $url = $match[1];
      }
      if (!stristr($url, 'http://')) {
        $url = $site_prefix . $url;
      }
      $this->printDebug('new url: ' . $url);

      if ($this->browser->get($url)->responseIsError()) {
        continue;
      }

      $text = $this->browser->getResponseText();
      $links = LsHtml::matchLinks($text);

      foreach ($links as $link) {
        if (preg_match('/' . $person->getNameRegex(true) . '/s', $link['text'])
          && preg_match($profile_pattern, $link['url'], $match)) {
          $url = $match[0];
          if (!stristr($url, 'http://')) {
            $url = $site_prefix . $url;
          }
          $businessweek_profile = $url;
          break;
        }
      }

      if ($businessweek_profile) {
        $this->printDebug('Businessweek profile found on 2nd attempt: ' . $businessweek_profile);
        break;
      }
    }

    if (!$businessweek_profile) {
      $this->printDebug('Buisnessweek profile not found');
      return;
    }
  }

  // Fetch the profile page and pull out the three sections of interest.
  $this->browser->get($businessweek_profile);
  $text = $this->browser->getResponseText();

  if (!$text) {
    $this->printDebug('Businessweek browser error');
    return;
  }

  $ed_matched = false;
  $education_found = array();
  if (preg_match('#EDUCATION[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<h2#is', $text, $education)) {
    // Groups: 1 = text inside <strong> (stored as the degree), 2 = optional
    // four-digit year, 3 = text of the following <div> (stored as institution).
    $ed_matched = preg_match_all('/<strong>(.+?)<\\/strong>\\s*(\\d{4})?\\s*<\\/div><div.*?>(.+?)</s', $education[1], $education_found);
  }

  // Default to no companies so a missing OTHER AFFILIATIONS section does not
  // lead to iterating over a non-array further down.
  $companies = array();
  if (preg_match('#OTHER AFFILIATIONS[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/td#s', $text, $employment)) {
    if (preg_match_all('#href\\=\\".+?\\"\\>(.+?)\\<\\/a\\>#is', $employment[1], $employment_found)) {
      $companies = $employment_found[1];
    }
  }

  // Only read the capture group when the BACKGROUND section actually matched;
  // otherwise the summary is an empty string.
  $summary_found = '';
  if (preg_match('#BACKGROUND[\\*]?<\\/h2>[\\n\\s]*(.+?)\\<\\/p>#s', $text, $summary_match)) {
    $summary_found = strip_tags($summary_match[1]);
  }

  if (!$ed_matched) {
    $this->printDebug('Education info not found at Businessweek');
    return;
  }
  $this->printDebug('Education info found at Businessweek');

  $wikipedia = new LsWikipedia();
  $wikipedia->request($person->name);
  $wikipedia->execute();
  $plaintext = $wikipedia->getPlainText();

  // Word list passed to getCommonPronouns(); identical for every institution,
  // so it is built once.
  $word_list = array_merge(LsLanguage::$business, LsLanguage::$schools, LsLanguage::$grammar);

  // Left as null (not array()) when empty so the object casts below are
  // unchanged for empty histories.
  $education_history = null;
  $employment_history = null;

  foreach ($education_found[3] as $key => $institution) {
    $arr = array(
      'institution' => $institution,
      'degree' => $education_found[1][$key],
      'year' => ($education_found[2][$key] != '') ? $education_found[2][$key] : null,
    );

    $wikipedia_matches = LsLanguage::getCommonPronouns($arr['institution'], $plaintext, $word_list);
    if ($wikipedia_matches) {
      $arr['source'] = 'http://en.wikipedia.org/wiki/' . str_replace('+', '_', $wikipedia->getTitle());
    } else {
      $arr['source'] = $businessweek_profile;
    }

    $education_history[] = (object) $arr;
  }

  foreach ($companies as $company) {
    $employment_history[] = (object) array('company' => $company, 'title' => null);
  }

  $possible_person = array(
    'name' => $person->name,
    'summary' => $summary_found,
    'employment_history' => (object) $employment_history,
    'education' => (object) $education_history,
  );
  $possible_persons = array((object) $possible_person);

  $this->import($person, $possible_persons);
}
/**
 * Extracts every <table class="wikitable"> found in $this->_content into a
 * nested array and stores it in $this->_wikiTables.
 *
 * Resulting shape (numbering starts at 1 on every level):
 *   array(
 *     'table1' => array(
 *       'row1' => array(
 *         'cell1' => array('str' => <cleaned text>, 'links' => <LsHtml::matchLinks() result>),
 *         ...
 *       ),
 *       ...
 *     ),
 *     ...
 *   )
 *
 * Only <td> cells are captured; a row without any <td> becomes an empty
 * array, and a table without any <tr> becomes an empty array.
 */
public function setWikiTables()
{
  $tables = array();

  if (preg_match_all('/<table\\s+class\\="wikitable"[^>]*>(.*?)<\\/table/isu', $this->_content, $table_hits)) {
    // preg_match_all() returns zero-based lists, so position + 1 gives the
    // one-based suffix used in the keys.
    foreach ($table_hits[1] as $table_pos => $table_html) {
      $rows = array();

      if (preg_match_all('/<tr[^>]*>(.*?)<\\/tr/isu', $table_html, $row_hits)) {
        foreach ($row_hits[1] as $row_pos => $row_html) {
          $cells = array();

          if (preg_match_all('/<td[^>]*>(.*?)<\\/td/isu', $row_html, $cell_hits)) {
            foreach ($cell_hits[1] as $cell_pos => $cell_html) {
              // Text form: strip tags, replace entities, then pass through
              // LsString::spacesToSpace().
              $plain = LsHtml::stripTags($cell_html);
              $plain = LsHtml::replaceEntities($plain);
              $plain = LsString::spacesToSpace($plain);

              $cells['cell' . ($cell_pos + 1)] = array(
                'str' => $plain,
                'links' => LsHtml::matchLinks($cell_html),
              );
            }
          }

          $rows['row' . ($row_pos + 1)] = $cells;
        }
      }

      $tables['table' . ($table_pos + 1)] = $rows;
    }
  }

  $this->_wikiTables = $tables;
}