// Fragment of a scraping script (collapsed source restored to one statement
// per line; the outermost foreach is not closed inside this chunk).
// Builds $r[engine][site][search term] = parsed hit count for each search URL.
// NOTE(review): $C, get_sources() and parse_html() are defined outside this
// fragment -- presumably a config object and HTTP/HTML helpers; confirm.
set_time_limit(0); // never time out: this loop performs long-running network I/O
// Scrape
$r = array();
foreach ($C->RECORD_ENGINE as $key => $val) {
    if (empty($val)) {
        continue;
    }
    foreach ($C->RECORD_URLS as $site => $urls) {
        if (is_array($urls) && count($urls) == 0) {
            continue;
        }
        foreach ($urls as $k => $v) {
            // Substitute the search term into the engine's URL template.
            $url = str_replace('*****', $v, $val);
            $cnt = 0;
            $tmp = FALSE;
            // Retry the fetch up to 3 times, sleeping 4s between failed attempts.
            // NOTE(review): @ suppresses any warning from get_sources(); failures
            // are only visible as the FALSE return value.
            while ($cnt < 3 && ($tmp = @get_sources($url)) === FALSE) {
                $cnt++;
                sleep(4);
            }
            if (!$tmp) {
                $r[$key][$site][$v] = 0; // all attempts failed: record zero
            } else {
                $r[$key][$site][$v] = parse_html($tmp); // $r[engine type][main site][search url]
            }
            // Push progress output to the client immediately.
            ob_flush();
            flush();
            echo $v . " ======== " . $r[$key][$site][$v] . "\n";
            sleep(3); // scrape interval -- original comment said 5s, but the code sleeps 3s
        }
// Closing brace for a loop opened in an earlier chunk of this file.
}
/*
 * Stole this function from here: http://www.php.net/manual/en/function.json-decode.php
 * Thanks to Colin Mollenhour
 * Renamed to just clean the json but not decode it
 */
// Strips CR/LF and wraps bare (unquoted) object keys in double quotes so the
// result can be fed to json_decode(). The regex only quotes keys that follow
// "{" or "," and contain no double quote; the whitespace captured before the
// key ($2) is dropped because the replacement string omits it.
function json_clean($json) {
    $json = str_replace(array("\n", "\r"), "", $json);
    $json = preg_replace('/([{,])(\\s*)([^"]+?)\\s*:/', '$1"$3":', $json);
    return $json;
}
//require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
// Directory page listing NYC school cancellations for 2012-2013.
$directory_url = "http://schools.nyc.gov/Home/InOurSchoolsToday/2012-2013/cancellations";
// NOTE(review): get_sources() is defined outside this fragment -- presumably
// it fetches the URL body as a string; confirm.
$records = get_sources($directory_url);
# Our record counter
$count = 1;
# To see where the errors are when attempting
# to deserialize using PHP's miserable excuse
# for a JSON parser, let's break the JSON
# into single lines and process each one.
foreach (explode("{ impacted: ", $records) as $line) {
    #foreach ($records as $record) {
    # Skip first record, it's just "[{ impacted: "
    if ($count == 1) {
        $count++;
        continue;
    }
    # Add the "impacted" back in, and trim the end
    $format = "{ \"impacted\": %s";
// Fragment: tail of a function body cut off by the chunk boundary -- it
// normalizes empty values to null before the enclosing scope closes.
{
    $value = empty($value) ? null : $value;
}
//require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
/// -------------------------------- First download the file --------------------------------
$url = 'http://www.usa.gov/About/developer_resources/allfaqs.xml';
$xml_file_path = '/tmp/allfaqs.xml';
//$xml_file_path = '/Users/philipashlock/Sites/test.dev/scraper/faq-data/allfaqs.xml';
// Download the FAQ XML with cURL and cache it to a local temp file.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($ch);
curl_close($ch);
file_put_contents($xml_file_path, $data);
/// ------------------------------------------------------------------------------------------
$records = get_sources($xml_file_path);
// Stream-parse the cached XML with XMLReader, visiting each <Row> element in
// turn. (Chunk is cut off mid-loop; the rest of the body, including the
// return value, is outside this view.)
function get_sources($xml_file_path) {
    // Specify configuration
    // NOTE(review): this looks like an HTML Tidy config array, but nothing in
    // the visible code consumes it -- confirm whether it is used further down.
    $config = array('indent' => true, 'output-xhtml' => false, 'output-html' => true, 'show-warnings' => false, 'show-body-only' => true, 'wrap' => 200);
    $count = 1;
    $XMLReader = new XMLReader();
    $XMLReader->open($xml_file_path);
    // Move to the first "[item name]" node in the file.
    while ($XMLReader->read() && $XMLReader->name !== "Row") {
    }
    // Now that we're at the right depth, hop to the next "[item name]" until the end of tree/file.
    while ($XMLReader->name === "Row") {
        if ($count > 1) {
            // Load this row's outer XML into simple_html_dom for easy querying.
            $dom = new simple_html_dom();
            $dom->load($XMLReader->readOuterXML());
// Fragment: tail of a conditional from the previous chunk -- the failure
// branch returns false to the caller.
} else {
    return false;
}
}
// --- New scraper: NJ DCA mayors CSV -> array of records ---
$run_environment = 'prod'; // either 'dev' or 'prod'
$max_records = 20; // only used for testing
if ($run_environment == 'dev') {
    error_reporting(E_ALL);
    ini_set('display_errors', 'On');
    require 'scraperwiki.php';
}
require 'scraperwiki/simple_html_dom.php';
$url = 'http://www.state.nj.us/dca/home/20120809mayors.csv';
$output = get_sources($url);
// In dev mode, dump the parsed records as JSON for inspection.
if ($run_environment == 'dev') {
    header('Content-type: application/json');
    print json_encode($output);
}
// Fetch the CSV via ScraperWiki and walk it line by line with str_getcsv.
// (Chunk is cut off mid-loop; the per-row handling and the function's return
// are outside this view.)
function get_sources($url) {
    global $run_environment;
    global $max_records;
    $csv = scraperWiki::scrape($url);
    $lines = explode("\n", $csv);
    $count = 1;
    // Each Row
    foreach ($lines as $line) {
        $line = str_getcsv($line);
        // Reset rep array
// Fragment of a page-routing switch (the switch header and earlier cases are
// outside this view). Each case loads the data its template needs, includes
// the shared main template, and exits. The break after each exit is
// unreachable but kept for symmetry across the cases.
    $page_content = "links_page.inc.php";
    include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
    exit;
    break;
case 'rules':
    // Offers, traffic sources, countries and languages are serialized into
    // JS data blobs consumed by the rules page.
    $arr_offers = get_rules_offers();
    list($js_last_offer_id, $js_offers_data) = get_offers_data_js($arr_offers);
    $js_sources_data = get_sources_data_js();
    $js_countries_data = get_countries_data_js();
    $js_langs_data = get_langs_data_js();
    $page_content = 'rules_page.inc.php';
    include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
    exit;
    break;
case 'costs':
    $arr_sources = get_sources();
    $arr_campaigns = get_campaigns();
    $arr_ads = get_ads();
    $page_content = 'costs_page.inc.php';
    include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
    exit;
    break;
case 'import':
    $page_content = 'import_page.inc.php';
    include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
    exit;
    break;
case 'adnets':
    $page_content = 'adnets_page.inc.php';
    include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
    exit;
// Fragment: closes two scopes from the previous chunk, then dispatches a
// cron-style fetch job by $type (1 = Twitter, 2 = RSS). Start/end timestamps
// for the Twitter batch are appended to twitter_1.txt.
// NOTE(review): $type and $connection are defined outside this fragment.
}
}
echo '7777777777'; // NOTE(review): leftover debug marker -- consider removing
if ($type == 1) {
    //twitter
    // Batch window comes from the query string, defaulting to rows 0..100.
    $start = isset($_GET['start']) ? $_GET['start'] : 0;
    $offset = isset($_GET['offset']) ? $_GET['offset'] : 100;
    // $cron_source = isset($_GET['cron_source']) ? $_GET['cron_source']: 'gae';
    // $time1 = time();
    $time1 = date("h:i:s");
    //set_time_limit(60);
    // Log the start time for this batch window.
    $fp = fopen('twitter_1.txt', 'a+');
    fwrite($fp, 's(' . $start . '):' . $time1 . "\n");
    fclose($fp);
    //get_sources($cat_id, $connection, $start, $offset);
    get_sources('', $connection, $start, $offset);
    // $time2 = time();
    $time2 = date("h:i:s");
    // Log the end time for this batch window.
    $fp = fopen('twitter_1.txt', 'a+');
    fwrite($fp, 'e(' . $start . '):' . $time2 . "\n");
    fclose($fp);
    // cron_start_end($time1, $time2, $cron_source);
} else {
    if ($type == 2) {
        //rss
        $start = isset($_GET['start']) ? $_GET['start'] : 0;
        $offset = isset($_GET['offset']) ? $_GET['offset'] : 100;
        // $cron_source = 'rss_source';
        // $time1 = time();
        $time1 = date("h:i:s");
        //set_time_limit(60);
// Fragment: download an Oregon cities CSV (prod only), cache it locally, and
// parse it row by row. NOTE(review): $run_environment, $max_records and
// $csv_file_path are defined outside this fragment -- confirm their values.
// faking this now, real location involves a more complicated POST request to:
// http://www.orcities.org/CityResources/LOCCityDirectory/CityReportViewer/tabid/6214/language/en-US/Default.aspx
$url = 'http://dropbox.ashlock.us/opengov/democracymap/cities_or_data.csv';
if ($run_environment == 'prod') {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_ENCODING, 'UTF-8');
    $data = curl_exec($ch);
    curl_close($ch);
    // solving for weird encoding issues
    // $data = iconv('utf-16le', 'utf-8', $data);
    file_put_contents($csv_file_path, $data);
    unset($data); // free the download buffer before parsing
}
/// ------------------------------------------------------------------------------------------
$output = get_sources($csv_file_path);
// In dev mode, dump the parsed records as JSON for inspection.
if ($run_environment == 'dev') {
    header('Content-type: application/json');
    print json_encode($output);
}
// Parse the cached CSV with fgetcsv. (Chunk is cut off mid-loop; the per-row
// handling and the function's return are outside this view.)
function get_sources($csv_file_path) {
    global $run_environment;
    global $max_records;
    if (($handle = fopen($csv_file_path, 'r')) !== false) {
        $count = 1;
        $cityrow_count = 1;
        $city = array();
        // loop through the file line-by-line
        while (($data = fgetcsv($handle)) !== false) {
            // Name of the city