/**
 * Scrapes locally stored result pages (one file per riding) into Riding objects.
 *
 * For every path returned by parent::getRidingPaths() the page is parsed with
 * DOMDocument/DOMXPath. The riding name, the per-party votes, the eligible
 * voter count and the total votes are pushed into a new Riding, and
 * updateTallies() is called on it.
 *
 * A riding whose file cannot be read, or whose page lacks an expected
 * element, is logged and skipped so that one bad page does not abort the
 * whole run. A skipped riding may already have received some of its data
 * (source, name, votes) before the missing element was detected.
 *
 * @param string $lang Language code handed to langPathPartLookup() and generateSource().
 *
 * @return void
 */
public static function scrape($lang = 'en') {
    self::getLogger('Starting offline scrape...');
    $langPathPart = self::langPathPartLookup($lang);
    $ridingPaths = parent::getRidingPaths(self::JURISDICTION_SHORTHAND, $langPathPart);
    $ridingCount = count($ridingPaths);
    self::addLog("{$ridingCount} ridings found");

    foreach ($ridingPaths as $i => $path) {
        $sourceUrl = self::generateSource($lang, $i);
        self::addLog("Getting results for riding {$i} {$path}...");

        $riding = new Riding();
        $riding->setSource($sourceUrl);

        // file_get_contents() returns FALSE on failure; an empty document
        // cannot be parsed either, so both cases skip the riding.
        $html = file_get_contents($path);
        if ($html === FALSE || $html === '') {
            self::addLog("Warning: no content for riding {$i} at {$path}");
            continue;
        }

        // NOTE(review): setErrorHandler() / setErrorHandler(TRUE) presumably
        // install and then restore a handler around the noisy HTML parse --
        // confirm against its definition.
        $doc = new \DOMDocument();
        self::setErrorHandler();
        $doc->loadHTML($html);
        self::setErrorHandler(TRUE);
        $xpath = new \DOMXPath($doc);

        // --- Riding name -------------------------------------------------
        $xPathQuery = '//*[@id="grdResultsucElectoralDistrictResult' . $i . '"]/caption';
        self::addLog("xPath: {$xPathQuery}");
        $captionNode = $xpath->query($xPathQuery)->item(0);
        if ($captionNode === NULL) {
            self::addLog("Warning: no caption found for riding {$i} at {$path}");
            continue;
        }
        // The first 50 bytes of the caption text are discarded -- presumably a
        // fixed label/whitespace prefix; confirm against a sample page.
        $ridingName = trim(substr($captionNode->textContent, 50));
        self::addLog($ridingName);
        // NOTE(review): utf8_decode() is deprecated as of PHP 8.2.
        $riding->setName(utf8_decode($ridingName));

        // --- Per-party votes (first table of the page) ---------------------
        $tables = $doc->getElementsByTagName('table');
        $tablesLength = $tables->length;
        self::addLog("Found {$tablesLength} items in \$tables");
        $resultsTable = $tables->item(0);
        if ($resultsTable === NULL) {
            self::addLog("Warning: no results table found for riding {$i} at {$path}");
            continue;
        }

        $rows = $resultsTable->getElementsByTagName('tr');
        $numRows = $rows->length;
        self::addLog("There are {$numRows} rows\n");

        // Row 0 is skipped and the last row is handled separately below
        // (it is read as the totals row).
        for ($j = 1; $j < $numRows - 1; $j++) {
            $cells = $rows->item($j)->getElementsByTagName('td');

            $partyCell = $cells->item(0);
            if ($partyCell === NULL) {
                continue;
            }
            $party = $partyCell->textContent;
            // Rows whose first cell contains a colon are not party rows.
            if (strpos($party, ':') !== FALSE) {
                continue;
            }

            $votesCell = $cells->item(2);
            if ($votesCell === NULL) {
                self::addLog("Warning: row {$j} has no votes cell for riding {$i}");
                continue;
            }
            // Keep digits only; this also removes thousands separators.
            $votes = preg_replace("/[^0-9]/", "", $votesCell->textContent);
            self::addLog("Scrapped: {$party}\t{$votes}\n");
            $riding->setVotes($party, $votes);
        }

        // --- Eligible voters -------------------------------------------------
        $xPathQuery = '//*[@id="divElectorNumberucElectoralDistrictResult' . $i . '"]/p';
        $votersNode = $xpath->query($xPathQuery)->item(0);
        if ($votersNode === NULL) {
            self::addLog("Warning: no elector count found for riding {$i} at {$path}");
            continue;
        }
        // The first 80 bytes of the paragraph text are discarded -- presumably
        // a fixed label prefix; confirm against a sample page.
        $numVoters = trim(substr($votersNode->textContent, 80));
        $numVoters = str_replace(',', '', $numVoters);
        self::addLog("Number of voters: {$numVoters}");
        $riding->setEligibleVoters($numVoters);

        // --- Total votes (third cell of the last row) ------------------------
        $lastRow = $numRows > 0 ? $rows->item($numRows - 1) : NULL;
        $totalCell = $lastRow === NULL ? NULL : $lastRow->getElementsByTagName('td')->item(2);
        if ($totalCell === NULL) {
            self::addLog("Warning: no total votes cell found for riding {$i} at {$path}");
            continue;
        }
        $totalVotes = str_replace(',', '', $totalCell->textContent);
        self::addLog("Number of total votes: {$totalVotes}");
        $riding->setAllRidingVotes($totalVotes);

        $riding->updateTallies();
    }
}
/**
 * Scrapes the result page of every riding into Riding objects.
 *
 * For each identifier returned by getRidingIdentifiers() the page at
 * getFinalPath($i) is downloaded and picked apart with grep()/regular
 * expressions: riding name, party list, per-party votes, eligible voters
 * and total votes are pushed into a new Riding, followed by updateTallies().
 * A riding whose page cannot be downloaded is logged and skipped.
 *
 * @return void
 */
public static function scrape() {
    self::getLogger('Starting scrape...');
    $ridingIdentfiers = self::getRidingIdentifiers();
    self::addLog('Got ' . count($ridingIdentfiers) . ' ridings: ' . join(', ', $ridingIdentfiers));

    foreach ($ridingIdentfiers as $i) {
        $url = self::getFinalPath($i);
        self::addLog("Getting results for riding {$i} {$url}...");

        $riding = new Riding();
        $riding->setSource($url);
        $ridingCount = count(Riding::getAllRidings());
        self::addLog("Riding count is {$ridingCount}");

        // Download failures are expected for some ridings: the warning is
        // suppressed and the failure is logged explicitly instead.
        $html = @file_get_contents($url);
        if ($html === FALSE) {
            self::addLog("Warning: no content for riding {$i} at {$url}");
            continue;
        }

        // --- Riding name -------------------------------------------------
        $string = self::grep($html, 'Unofficial Poll Results', TRUE);
        $ridingName = self::grep($string, 'Unofficial Poll Results - [0-9][0-9]* ([\\. A-Z-]*)')[0];
        // Drop the last two characters of the extracted name.
        $ridingName = substr_replace($ridingName, '', -2);
        self::addLog("Got ridingName: {$ridingName}");
        $riding->setName($ridingName);

        // --- Parties -----------------------------------------------------
        $string = self::grep($html, 'CHeadCA', TRUE);
        $partyLine = isset($string[0]) ? $string[0] : '';
        preg_match_all("|<DIV CLASS=CHPA>([A-Z]*)</DIV>|", $partyLine, $matches);
        self::addLog('Got for party: ' . count($matches[1]) . ' matches: ' . join(',', $matches[1]));

        // The 'ColFooter' lines feed three extractions below (per-party votes,
        // eligible voters, total votes), so grep() is called once and reused.
        $footerLines = self::grep($html, 'ColFooter', TRUE);

        // --- Per-party votes (footer line at offset 2) -----------------------
        $partyVotesLine = isset($footerLines[2]) ? $footerLines[2] : '';
        preg_match_all("|<TD Class=ColFooter ALIGN=RIGHT VALIGN=TOP>([ 0-9,]*)<BR>|", $partyVotesLine, $partyVoteMatches);
        self::addLog('Got for votes: ' . count($partyVoteMatches[1]) . ' matches: ' .
            join(' ', $partyVoteMatches[1]));

        // Count the captured party names; counting the result of a comparison
        // (as the previous code did) never reflects how many parties matched.
        if (count($matches[1]) == 0) {
            self::addError("No matches found for party");
        }

        foreach ($matches[1] as $index => $party) {
            // A party without a matching votes cell yields an empty string,
            // which is reported through the empty() check below.
            $rawVotes = isset($partyVoteMatches[1][$index]) ? $partyVoteMatches[1][$index] : '';
            $votes = str_replace(',', '', $rawVotes);
            if (empty($votes)) {
                self::addError("No votes found for {$party} in {$ridingName}");
            }
            self::addLog("Setting {$votes} votes for {$party} in {$ridingName}");
            $riding->setVotes($party, $votes);
        }

        // --- Eligible voters (footer line at offset 1) -----------------------
        $votersLine = isset($footerLines[1]) ? $footerLines[1] : '';
        preg_match_all("|<TD Class=ColFooter ALIGN=RIGHT VALIGN=TOP>([0-9,]*)|", $votersLine, $voterMatches);
        self::addLog('Got for voters count ' . count($voterMatches[1]) . ' matches: ' . join(' ', $voterMatches[1]));
        $numVoters = str_replace(',', '', isset($voterMatches[1][0]) ? $voterMatches[1][0] : '');
        self::addLog("Number of voters: {$numVoters}");
        $riding->setEligibleVoters($numVoters);

        // --- Total votes (footer line at offset 5) ---------------------------
        $totalLine = isset($footerLines[5]) ? $footerLines[5] : '';
        preg_match_all("|<TD Class=ColFooter ALIGN=RIGHT VALIGN=TOP>([ 0-9,]*)</TABLE>|", $totalLine, $totalMatches);
        self::addLog('Got ' . count($totalMatches[1]) . ' matches: ' . join(' ', $totalMatches[1]));
        $totalVotes = str_replace(',', '', isset($totalMatches[1][0]) ? $totalMatches[1][0] : '');
        self::addLog("Number of total votes: {$totalVotes}");
        $riding->setAllRidingVotes($totalVotes);

        // Recompute the riding's tallies now that all figures are set.
        $riding->updateTallies();
    }
}