/**
 * Walks every anchor without a title attribute inside the #gallery element
 * and hands each href — together with a running 1-based position — to
 * saveIt().
 *
 * Relies on two globals set up elsewhere in the script:
 *   $html — simple_html_dom document (assumed already loaded — TODO confirm)
 *   $url  — the page the links were scraped from
 */
function retrieve()
{
    global $html;
    global $url;

    $position = 0;
    foreach ($html->find('#gallery a[!title]') as $anchor) {
        $position++;
        saveIt($url, $anchor->href, $position);
    }
}
// NOTE(review): this excerpt begins mid-function — the opening
// `function ...(...) {` for the parsing code below lies before the visible
// chunk, and the lone `}` further down closes that unseen function.
//
// Sample of the markup the pattern below is built to match:
# START
#<h3>
#2.183 million tonnes CO
#<sub style="font-size: 60%;">2</sub>
#e
#</h3>
# END
// Capture group 1: integer tonnes; group 2: optional first decimal digit.
$pattern = "/<h3\\b[^>]*>([0-9]+)\\.?([0-9]?)/i";
// Earlier attempt, kept disabled for reference:
#$pattern = "/(<h3\b[^>]*)>/i";
$subject = $HTMLChunk; // HTML fragment — presumably set by the enclosing (unseen) function; verify upstream
if (preg_match($pattern, $subject, $matches)) {
    echo "A match was found.";
    print "\n" . $matches[0] . "\n";
    // NOTE(review): returns the FULL matched text (tag prefix included),
    // not just the captured digit groups — callers appear to store it as-is.
    return $matches[0];
}
}

/**
 * Saves one weekly CO2-equivalent emissions record for a state to the
 * ScraperWiki datastore.
 *
 * @param mixed  $c02E_YTD      CO2e year-to-date figure
 * @param mixed  $c02E_thisWeek CO2e figure for the current week
 * @param string $state         state key; part of the unique index
 */
function saveIt($c02E_YTD, $c02E_thisWeek, $state)
{
    // save the record in the db
    $record = array(
        'year' => date('Y'),
        'state' => $state,
        'week' => date('W'),
        'date' => date('j/m/Y'),
        'c02E_this_week' => $c02E_thisWeek,
        'c02E_year_to_date' => $c02E_YTD
    );
    // ('state', 'week') is the unique key — re-running within the same ISO
    // week overwrites the row rather than duplicating it.
    scraperwiki::save(array('state', 'week'), $record);
}

// Driver loop: one fetch/parse/save cycle per state.
// NOTE(review): $stateCode, fetchPage(), parsePage_thisYearToDate() and
// parsePage_thisWeek() are defined outside this excerpt.
foreach ($stateCode as $key => $value) {
    // fetch page
    $page = fetchPage($value);
    // get tonnes of carbon this year to date
    $carbontonnes_ytd = parsePage_thisYearToDate($page);
    $carbontonnes_thisWeek = parsePage_thisWeek($page);
    // save the data
    saveIt($carbontonnes_ytd, $carbontonnes_thisWeek, $key);
}
// All other pages upto Last Page ($last_page)
// NOTE(review): $last_page is hard-coded to 500 rather than scraped, so
// requests continue past the site's real last page. $root_url and the
// two-argument saveIt() are defined outside this excerpt.
$last_page = 500;
for ($page = 1; $page <= $last_page; $page++) {
    $url = $root_url . "/inspirational-quotes?page=" . $page;
    $html = file_get_html($url);
    // Each .index_card element holds one quote/author pair.
    foreach ($html->find('.index_card') as $card) {
        $quote = $card->find('.quotation', 0)->innertext;
        $author = $card->find('.quotation_author', 0)->plaintext;
        $quote = cleanQuotes($quote);
        $author = cleanAuthor($author);
        //echo "<br>".$quote."<br>";
        //echo '-'.$author."<hr>";
        try {
            saveIt($quote, $author);
        } catch (Exception $e) {
            // On save failure, record the exception text in place of a quote.
            saveIt("{$e}", "NOTHING");
        }
    }
    // Release simple_html_dom memory before fetching the next page.
    $html->clear();
    unset($html);
}

/**
 * Removes typographic (curly) double quotation marks from a quote string.
 *
 * @param string $str raw quote text/markup
 * @return string the quote with curly quotes stripped
 */
function cleanQuotes($str)
{
    $str = str_replace("“", "", $str);
    $str = str_replace("”", "", $str);
    return $str;
}

// Strips the trailing share-prompt boilerplate and surrounding whitespace
// from a scraped author string.
// NOTE(review): cleanAuthor() is truncated by the excerpt boundary — its
// closing brace (and any return statement) lies beyond the visible lines.
function cleanAuthor($a)
{
    $a = str_replace(" Share or Discuss This Quote", "", $a);
    $a = trim($a);
// To retrieve the last page: load the home page and pull the highest page
// number out of the pager's "last" link href.
// NOTE(review): $root_url and $joke_count are defined outside this excerpt.
$html = file_get_html($root_url);
$pagerLink = $html->find('li.pager-last a', 0);
$pagerHref = $pagerLink->href;
preg_match('/[0-9]+/', $pagerHref, $pageDigits);
$last_page = $pageDigits[0];

// Save every joke found on the home page itself.
foreach ($html->find('.item-list ul li a.createYourOwn') as $entry) {
    saveIt($entry->innertext);
}

// All other pages up to the last page: release the previous DOM before each
// fetch, then save every joke anchor on the page.
$page = 1;
while ($page <= $last_page) {
    $html->clear();
    unset($html);
    $url = $root_url . "/all-chuck-norris-facts?page=" . $page;
    $html = file_get_html($url);
    foreach ($html->find('.item-list ul li a.createYourOwn') as $entry) {
        saveIt($entry->innertext);
    }
    $page++;
}

/**
 * Stores one joke in the ScraperWiki datastore under a fresh sequential id.
 *
 * Increments the global $joke_count and uses it as JOKE_ID, the unique key.
 *
 * @param string $txt joke markup/text taken from the anchor's innertext
 */
function saveIt($txt)
{
    global $joke_count;
    $row = array('JOKE_ID' => ++$joke_count, 'JOKE_TEXT' => $txt);
    scraperwiki::save(array('JOKE_ID'), $row);
}