/**
 * Fetches the proxy document at $url and tries to find, for every person in
 * $roster, the way that person's name is written in the proxy text.
 *
 * Roster entries are read through the keys 'officerTitle', 'isDirector' and
 * 'personName'. Entries with an empty title that are not flagged as director
 * are skipped, as are entries that already carry a 'proxyName'. When a
 * compatible name is found the entry receives 'proxyUrl', 'proxyYear',
 * 'nameLast' and 'proxyName'. An entry whose 'personName' starts with a
 * single letter (O LEARY) has that key rewritten to the joined form (OLEARY).
 *
 * Side effects: echoes a progress line, stores the normalised document text
 * in $this->proxyText, and appends a line to $this->logFile when the request
 * returns an error response.
 *
 * @param array  $roster     roster entries (form 4 data, names like "RUBIN ROBERT E")
 * @param string $url        address of the proxy document to fetch
 * @param mixed  $proxy_year value stored as 'proxyYear' on matched entries
 *
 * @return array the roster, with matched entries enriched as described above
 */
private function getProxyData($roster, $url, $proxy_year)
{
    echo "fetching data from proxy at {$url} \n\n";

    if ($this->browser->get($url)->responseIsError()) {
        //Error response (eg. 404, 500, etc)
        $log = fopen($this->logFile, 'a');
        //fopen returns false when the log cannot be opened; don't hand that to fwrite/fclose
        if ($log !== false) {
            fwrite($log, "Couldn't get " . $url . "\n");
            fclose($log);
        }

        return $roster;
    }

    $this->proxyText = $this->browser->getResponseText();
    $this->proxyText = LsHtml::replaceEntities($this->proxyText, ENT_QUOTES, 'UTF-8');
    $this->proxyText = LsString::utf8TransUnaccent($this->proxyText);

    //the window handed to the name regex spans this many bytes around each hit
    $bytes_before = 70;
    $bytes_after = 50;

    foreach ($roster as &$r) {
        //make sure this is not form 4 data for a corporation, continue to the next if it is
        if ($r['officerTitle'] == '' && $r['isDirector'] != 1 && strtoupper($r['isDirector']) != strtoupper('true')) {
            continue;
        }

        $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);

        //an empty (or separator-only) name gives nothing to search for; an empty
        //needle would make stripos match at every offset
        if (!$parts) {
            continue;
        }

        //first word, but has to be part of last name because form4 names are in format RUBIN ROBERT E
        $last = trim($parts[0]);

        //sometimes O'LEARY can appear as O LEARY in the form 4
        if (strlen($last) == 1) {
            $r['personName'] = $last . substr($r['personName'], 2);
            $parts = preg_split("/[\\s|\\.]+/", $r['personName'], -1, PREG_SPLIT_NO_EMPTY);
            $last = trim($parts[0]);
        }

        //prepare regex to match occurrences of full name
        //case insensitive to accommodate for various irregularities in names
        $re = LsLanguage::buildLooseNameRegex($r['personName']);

        //words of the form 4 name, used by the compatibility check below;
        //they do not change between matches, so split once per person
        $name1_parts = preg_split('/\\s+/', $r['personName'], -1, PREG_SPLIT_NO_EMPTY);

        $offset = 0;

        //use stripos (much faster than regex) to find occurrences of the first word in the form 4 name
        //(assumed to be part of the last name); needs to be case insensitive
        //continue searching for last name in proxy until a matching full name (proxyName) is found
        while (!isset($r['proxyName'])) {
            $found = stripos($this->proxyText, $last, $offset);
            if ($found === false) {
                break;
            }
            $offset = $found + 1;

            //clamp the window start: a negative offset would make substr count from
            //the END of the text and the regex would inspect the wrong passage
            $window_start = max(0, $found - $bytes_before);
            $str = substr($this->proxyText, $window_start, $found + $bytes_after - $window_start);

            //false (regex failure) and 0 (no match) both mean there is nothing to inspect here
            if (!preg_match_all($re, $str, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
                continue;
            }

            //note: every match in the window is examined, so a later compatible
            //match overwrites the fields set by an earlier one
            foreach ($matches as $match) {
                if (stristr($match[1][0], '=')) {
                    continue;
                }

                //since we may or may not be working with the full last name, use getLastName to return full last name
                $new_last = $this->getLastName($r['personName'], $match[1][0]);
                if (!$new_last) {
                    continue;
                }

                //if last name produced by case insensitive search has no capital letters, not a match
                if (preg_match('/\\p{Lu}/su', $new_last) == 0) {
                    continue;
                }

                //now that we have a last name, pull the full name from the string
                $name = LsLanguage::getNameWithLast($match[0][0], $new_last);
                if (!$name) {
                    continue;
                }

                $start_parts = preg_split('/\\s+/isu', $name['nameStart'], -1, PREG_SPLIT_NO_EMPTY);
                if (!$start_parts) {
                    continue;
                }

                //if all we've found are matching prefixes, not a match
                $non_prefixes = array_diff($start_parts, PersonTable::$nameParsePrefixes);
                if (count($non_prefixes) == 0) {
                    continue;
                }

                //compatibility check to correct for vagueness of regex: at least one
                //non-prefix word must start with, or be the start of, a form 4 name word
                $compatible = false;
                foreach ($non_prefixes as $n) {
                    foreach ($name1_parts as $p) {
                        if (stripos($n, $p) === 0 || stripos($p, $n) === 0) {
                            $compatible = true;
                            break 2;
                        }
                    }
                }

                //phew -- if name is (somewhat) compatible, assume we've found it
                if ($compatible) {
                    $r['proxyUrl'] = $url;
                    $r['proxyYear'] = $proxy_year;
                    $r['nameLast'] = trim(LsString::spacesToSpace($name['nameLast']));
                    $r['proxyName'] = trim(LsString::spacesToSpace($name['nameFull']));
                }
            }
        }
    }
    //drop the reference left behind by the by-reference foreach
    unset($r);

    return $roster;
}