/**
 * Build a donor from a "<BR>"-delimited HTML fragment.
 *
 * The fragment is split on the literal (upper-case) "<BR>" tag and read
 * positionally:
 *   [0] donor name      - tags stripped, then handed to generatePerson()
 *   [1] address line    - parsed by LsLanguage::parseCityStatePostal()
 *   [2] free-form text  - tags stripped and stored as the donor summary
 *
 * Missing segments or missing address parts are tolerated and simply leave
 * the corresponding fields null/empty instead of raising undefined-index
 * warnings.
 *
 * @param string $text raw HTML fragment describing one donor
 *
 * @return mixed whatever generatePerson() returns, with one address and a
 *               summary attached
 */
private function generateDonor($text)
{
    $text_arr = explode("<BR>", $text);

    $nameHtml = $text_arr[0];
    $addressLine = isset($text_arr[1]) ? $text_arr[1] : null;
    $summaryHtml = isset($text_arr[2]) ? $text_arr[2] : '';

    $donor = $this->generatePerson(LsHtml::stripTags($nameHtml, ''));

    $address_arr = LsLanguage::parseCityStatePostal($addressLine);

    $a = new Address();
    $a->street1 = isset($address_arr['street1']) ? $address_arr['street1'] : null;
    $a->street2 = isset($address_arr['street2']) ? $address_arr['street2'] : null;
    $a->city = isset($address_arr['city']) ? $address_arr['city'] : null;

    // Only attach a state when the parsed text resolves to a known one.
    $stateText = isset($address_arr['state']) ? $address_arr['state'] : null;
    if ($state = AddressStateTable::retrieveByText($stateText)) {
        $a->State = $state;
    }

    $a->postal = isset($address_arr['zip']) ? $address_arr['zip'] : null;

    $donor->addAddress($a);
    $donor->summary = strip_tags(trim($summaryHtml));

    return $donor;
}
/**
 * Fetch one Forbes private-company profile page, scrape it, and persist it.
 *
 * The page layout differs by list year ($this->year), so every field is
 * extracted with a year-specific regular expression. After scraping:
 *   - if an Org/PrivateCompany entity with the same (lower-cased) name
 *     exists, only its revenue is updated;
 *   - otherwise a new entity is created with the scraped employees,
 *     revenue, website, summary, phone and (when city and state were
 *     found) address.
 * In both cases the entity is saved, a 'Forbes.com' reference pointing at
 * $url is added, and the entity is passed to saveToList() with its rank.
 *
 * Industry, fax and CEO details are parsed but currently not stored.
 *
 * @param string $url profile page to scrape
 *
 * @return void nothing is imported when the request fails or no company
 *              name can be found on the page
 */
protected function import($url)
{
    if ($this->browser->get($url)->responseIsError()) {
        $this->printDebug("Couldn't get company: " . $url);

        return;
    }

    $text = $this->browser->getResponseText();

    // Year buckets corresponding to the page layouts handled below.
    $year = $this->year;
    $isLate90s = $year > 1995 && $year < 2000;      // 1996-1999
    $isEarly2000s = $year > 1999 && $year < 2005;   // 2000-2004
    $isLegacy = $year > 1995 && $year < 2005;       // 1996-2004
    $isModern = $year > 2004;                       // 2005 onwards

    $rank = null;
    $name = null;
    $industryName = null;
    $street1 = null;
    $street2 = null;
    $city = null;
    $state = null;
    $postal = null;
    $phone = null;
    $fax = null;
    $website = null;
    $summary = null;
    $revenue = null;
    $employees = null;
    $ceoName = null;
    $ceoBirthYear = null;

    //get rank
    if ($isEarly2000s && preg_match('/ForbesListRank" content="(\\d+)"/i', $text, $match)) {
        $rank = $match[1];
    } elseif ($year < 2000 && preg_match('/td class="highlightcolor1">(\\d+)/i', $text, $match)) {
        $rank = $match[1];
    } elseif ($isModern && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/i', $text, $match)) {
        $rank = html_entity_decode($match[1]);
    }

    //get name (mandatory: without it the page is skipped)
    if ($isLegacy && preg_match('/span class="mainlisttitle">([^<]+)<\\/span>/i', $text, $match)) {
        $name = html_entity_decode($match[1]);
    } elseif ($isModern && preg_match('/<b>#(\\d+) ([^<]+)<\\/b>/i', $text, $match)) {
        $name = html_entity_decode($match[2]);
    } else {
        $this->printDebug("Company name not found");

        return;
    }

    //get industry
    if ($year > 1995 && $year < 2001 && preg_match('/<b>See more private companies in <a [^>]+>([^<]+)<\\/a><\\/b>/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    } elseif ($year > 2000 && $year < 2005 && preg_match('/private companies\\<\\/a> in ([^\\.]+)/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    } elseif ($isModern && preg_match('/<b>Industry:<\\/b> <a href="[^"]+">([^<]+)<\\/a>/ism', $text, $match)) {
        $industryName = trim(html_entity_decode($match[1]));
    }

    //get address
    $legacyContactHtml = null;
    if ($isLate90s && preg_match('/<td class="mainlisttxt"\\>(.+)phone/smU', $text, $match)) {
        $legacyContactHtml = $match[1];
    } elseif ($isEarly2000s && preg_match('/(view private companies under this industry|in the same industry).+<br><br>(.+)phone/is', $text, $match)) {
        // Group 1 is the anchor phrase; the contact block itself is group 2.
        $legacyContactHtml = $match[2];
    } elseif ($isModern && preg_match('/<div class="spaced">(.+)<\\/div>/ismU', $text, $match)) {
        $contactLines = explode('<br>', $match[1]);
        $lineCount = count($contactLines);
        $firstLine = $contactLines[0];
        $secondLine = isset($contactLines[1]) ? $contactLines[1] : '';

        // When the block opens with phone/fax there is no postal address.
        if (!preg_match('/Phone\\:|Fax\\:/i', $firstLine) && !preg_match('/Phone\\:|Fax\\:/i', $secondLine)) {
            $street1 = trim($firstLine);

            // 4 pieces: street, city line, ...; 5 pieces: street, street2, city line, ...
            $cityLine = null;
            if ($lineCount == 4) {
                $cityLine = $contactLines[1];
            } elseif ($lineCount == 5) {
                $street2 = trim($contactLines[1]);
                $cityLine = $contactLines[2];
            }

            if ($cityLine !== null && preg_match('/^(.+?) ([A-Z]{2}) (\\d{5})($|-)/sU', trim($cityLine), $cityMatch)) {
                $city = $cityMatch[1];
                $state = $cityMatch[2];
                $postal = $cityMatch[3];
            }
        }
    }

    if ($legacyContactHtml !== null) {
        $parsed = $this->parseLegacyContactLines($legacyContactHtml);
        $street1 = $parsed['street1'];
        $street2 = $parsed['street2'];
        $city = $parsed['city'];
        $state = $parsed['state'];
        $postal = $parsed['postal'];
    }

    //get phone (stored as digits only)
    if ($isLegacy && preg_match('/phone ([\\d\\-]{12})/is', $text, $match)) {
        $phone = trim(str_replace('-', '', $match[1]));
    } elseif ($isModern && preg_match('/Phone: ([\\d\\-]{12})/is', $text, $match)) {
        $phone = trim(str_replace('-', '', $match[1]));
    }

    //get fax (parsed only; it is not attached to the company)
    if ($isLegacy && preg_match('/fax ([\\d\\-]{12})/is', $text, $match)) {
        $fax = trim(str_replace('-', '', $match[1]));
    } elseif ($isModern && preg_match('/Fax: ([\\d\\-]{12})/is', $text, $match)) {
        $fax = trim(str_replace('-', '', $match[1]));
    }

    //get website
    if ($isLegacy && preg_match('/this company\'s web site[^>]+\\>(http[^\\<]+)/is', $text, $match)) {
        $website = $match[1];
    } elseif ($isModern && preg_match('/<div class="spaced">.*<\\/div>\\s+<br>\\s+<a href="(http:\\/\\/[^"]+)">/ismU', $text, $match)) {
        $website = $match[1];
    }

    //get ceo
    if ($isLegacy && preg_match('/b>CEO: ([^<]+)<\\/b>/ism', $text, $match)) {
        $ceoName = $match[1];
    } elseif ($isModern && preg_match('/CEO: ([^<]+)<\\/b> , (\\d+) <br>/ism', $text, $match)) {
        $ceoName = html_entity_decode($match[1]);
        // The number after the name is subtracted from the current year.
        $ceoBirthYear = (int) date("Y") - (int) $match[2];
    }

    //get summary
    $rawSummary = null;
    if ($isLate90s && preg_match_all('/p class="mainlisttxt">(.*)<\\/p>/ismU', $text, $match)) {
        // On these pages the second matching paragraph holds the summary.
        $rawSummary = isset($match[1][1]) ? $match[1][1] : '';
    } elseif ($isEarly2000s && preg_match('/p class="mainlisttxt">(.*)<\\/p>/ismU', $text, $match)) {
        $rawSummary = $match[1];
    } elseif ($isModern && preg_match('/<blockquote class="spaced">(.*)<\\/blockquote>/ismU', $text, $match)) {
        $rawSummary = $match[1];
    }
    if ($rawSummary !== null) {
        // NOTE(review): the first search string is a plain space here, making
        // that replacement a no-op; possibly a non-breaking space was intended.
        $summary = str_replace(array(' ', "\n"), array(' ', ' '), html_entity_decode(trim(strip_tags($rawSummary))));
    }

    //get revenue
    // NOTE(review): the "mil" branches append six zeros textually, which
    // presumably assumes whole-number figures - confirm against real pages.
    if ($isLate90s && preg_match('/<td class="mainlisttxt">\\$([\\S]+) mil<sup>e?<\\/sup><\\/td>/ismU', $text, $match)) {
        $this->printDebug($match[1]);
        $revenue = str_replace(",", "", $match[1] . ",000,000");
    } elseif ($isEarly2000s && preg_match('/<td class="mainlisttxt" nowrap>([^<]+)<sup>e?<\\/sup><\\/td>/ismU', $text, $match)) {
        $this->printDebug($match[1]);
        $revenue = str_replace(",", "", $match[1] . ",000,000");
    } elseif ($isModern && preg_match('/<td class="highlight" nowrap="nowrap">\\$([\\S]+) bil.*<\\/td> <td class="highlight" nowrap="nowrap">[^<]+<\\/td> <td class="highlight" nowrap="nowrap">([^<]+)<\\/td>/ismU', $text, $match)) {
        // Strip thousands separators so the multiplication sees a clean number.
        $revenue = 1000000000 * str_replace(',', '', $match[1]);
    }

    //get employees
    if ($isLegacy && preg_match('/mil<\\/td>.+<td class="mainlisttxt"( nowrap)?>(\\d[^<]+)<\\/td>.+<td class="mainlisttxt">[a-zA-Z]+<\\/td>/ismU', $text, $match)) {
        $employees = str_replace(',', '', $match[2]);
    } elseif ($isEarly2000s && preg_match('/<sup>e?<\\/sup><\\/td> <td class="mainlisttxt"( nowrap)?>(\\d[^<]+)<sup>e?<\\/sup><\\/td> <td class="mainlisttxt">[a-zA-Z]+<\\/td>/ismU', $text, $match)) {
        $employees = str_replace(',', '', $match[2]);
    } elseif ($isModern && preg_match('/<td class="highlight" nowrap="nowrap">([\\d,]+)<\\/td> <td class="highlight" nowrap="nowrap">[A-Z][a-z]{2,}<\\/td>/', $text, $match)) {
        $employees = str_replace(',', '', $match[1]);
    }

    // Name with business words/abbreviations removed. It is only logged;
    // the lookup below compares against the full scraped name.
    $nameWords = explode(' ', ucwords(strtolower($name)));
    $businessWords = array_merge(LsLanguage::$business, LsLanguage::$businessAbbreviations);
    $search_company_name = trim(implode(' ', array_diff($nameWords, $businessWords)));
    $this->printDebug("{$search_company_name} == {$name}");

    // NOTE(review): dashes are stripped from org.name but not from $name,
    // so scraped names containing '-' presumably never match - confirm.
    $company = EntityTable::getByExtensionQuery(array('Org', 'PrivateCompany'))
        ->addWhere("LOWER(REPLACE( org.name, '-' , '')) = ?", strtolower($name))
        ->fetchOne();

    if ($company) {
        $this->printDebug("Company exists");
        $company->revenue = $revenue;
        $company->save();
    } else {
        $this->printDebug("Creating new company {$name}");
        Doctrine::getTable('ExtensionDefinition')->clear();

        $company = new Entity();
        $company->addExtension('Org');
        $company->addExtension('Business');
        $company->addExtension('PrivateCompany');
        $company->name = LsLanguage::titleize($name);
        $company->employees = strlen($employees) ? $employees : null;
        $company->revenue = strlen($revenue) ? $revenue : null;
        $company->website = strlen($website) ? $website : null;
        $company->summary = strlen($summary) ? trim($summary) : null;

        if ($phone) {
            $company->addPhone($phone);
        }

        //add address (only when both city and state were scraped)
        if ($city && $state) {
            $address = new Address();
            $address->street1 = strlen($street1) ? $street1 : null;
            $address->street2 = strlen($street2) ? $street2 : null;
            $address->city = strlen($city) ? $city : null;

            if ($stateRecord = AddressStateTable::retrieveByText($state)) {
                $address->State = $stateRecord;
            }

            $address->postal = $postal;
            $company->addAddress($address);
            $address->save();
            $address->addReference(
                $url,
                null,
                array('city', 'country_id', 'postal', 'state_id', 'street1'),
                'Forbes.com',
                null,
                null
            );
        }
    }

    $company->save();
    $company->addReference(
        $url,
        null,
        array('website', 'name', 'summary', 'revenue', 'employees'),
        'Forbes.com',
        null,
        null
    );

    $this->saveToList($company, $rank);
}

/**
 * Split a pre-2005 "<br>"-separated contact block into address parts.
 *
 * The trailing piece (the text between the last "<br>" and the word
 * "phone") is discarded. Of the remaining lines the first is the street;
 * with exactly three lines the second is a second street line and the
 * third the city/state/zip line, otherwise the second line is treated as
 * the city/state/zip line.
 *
 * @param string $contactHtml HTML captured in front of the "phone" label
 *
 * @return array keys street1, street2, city, state, postal (null if absent)
 */
private function parseLegacyContactLines($contactHtml)
{
    $parts = array(
        'street1' => null,
        'street2' => null,
        'city' => null,
        'state' => null,
        'postal' => null,
    );

    $lines = explode('<br>', trim($contactHtml));
    array_pop($lines);

    if (!isset($lines[0])) {
        return $parts;
    }

    $parts['street1'] = trim($lines[0]);

    if (count($lines) == 3) {
        $parts['street2'] = trim($lines[1]);
        $cityLine = $lines[2];
    } else {
        $cityLine = isset($lines[1]) ? $lines[1] : null;
    }

    if ($cityLine === null) {
        return $parts;
    }

    $city_state_zip = LsLanguage::parseCityStatePostal($cityLine);
    $parts['city'] = isset($city_state_zip['city']) ? $city_state_zip['city'] : null;
    $parts['state'] = isset($city_state_zip['state']) ? $city_state_zip['state'] : null;
    $parts['postal'] = isset($city_state_zip['zip']) ? $city_state_zip['zip'] : null;

    return $parts;
}