Example #1
0
set_time_limit(0);
// Scrape: for every enabled engine URL template, substitute each search
// term, fetch the page (with retries), and record the parsed result in
// $r[engine type][site][search url].
$r = array();
foreach ($C->RECORD_ENGINE as $key => $val) {
    // Skip engines with an empty URL template.
    if (empty($val)) {
        continue;
    }
    foreach ($C->RECORD_URLS as $site => $urls) {
        if (is_array($urls) && count($urls) == 0) {
            continue;
        }
        foreach ($urls as $k => $v) {
            // Build the request URL by substituting the term into the template.
            $url = str_replace('*****', $v, $val);
            $cnt = 0;
            $tmp = FALSE;
            // Retry up to 3 times, pausing 4s between failed attempts.
            while ($cnt < 3 && ($tmp = @get_sources($url)) === FALSE) {
                $cnt++;
                sleep(4);
            }
            if (!$tmp) {
                // All attempts failed: record a zero result for this term.
                $r[$key][$site][$v] = 0;
            } else {
                $r[$key][$site][$v] = parse_html($tmp);
                //$r[engine type][site][search url]
            }
            ob_flush();
            flush();
            echo $v . " ======== " . $r[$key][$site][$v] . "\n";
            sleep(3);
            // Pause between fetches. NOTE(review): the original comment said
            // a 5s interval but the code sleeps 3s — behavior kept as coded.
        }
    }
}
// FIX: the original was missing the closing brace of the outermost foreach
// (only two closers for three nested foreach loops), which made everything
// after it — including later function declarations — part of the loop body.
/*
 * Quote the bare (unquoted) keys in a JSON-ish string so it can be parsed.
 * Adapted from a comment by Colin Mollenhour on
 * http://www.php.net/manual/en/function.json-decode.php — renamed and
 * reduced to only clean the string, not decode it.
 */
function json_clean($json)
{
    // Line breaks would confuse the key-quoting regex, so drop them first.
    $flattened = str_replace(array("\r", "\n"), '', $json);
    // After a '{' or ',', wrap everything up to the next ':' in double quotes.
    return preg_replace('/([{,])(\s*)([^"]+?)\s*:/', '$1"$3":', $flattened);
}
//require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
// Scraper for the NYC schools cancellations page: the fetched content is
// split on "{ impacted: " so each malformed-JSON record can be repaired
// one at a time.
// NOTE(review): this fragment appears truncated — the foreach below is never
// closed in this chunk, and the stray braced block at the end references
// $value, which is not defined anywhere in view.
$directory_url = "http://schools.nyc.gov/Home/InOurSchoolsToday/2012-2013/cancellations";
$records = get_sources($directory_url);
# Our record counter
$count = 1;
# To see where the errors are when attempting
# to deserialize using PHP's miserable excuse
# for a JSON parser, let's break the JSON
# into single lines and process each one.
foreach (explode("{ impacted: ", $records) as $line) {
    #foreach ($records as $record) {
    # Skip first record, it's just "[{ impacted: "
    if ($count == 1) {
        $count++;
        continue;
    }
    # Add the "impacted" back in, and trim the end
    $format = "{ \"impacted\": %s";
{
    $value = empty($value) ? null : $value;
}
//require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
/// -------------------------------- First download the file --------------------------------
// Download the USA.gov FAQ XML feed via cURL, cache it to a local file,
// then hand the file path to get_sources() for parsing.
$url = 'http://www.usa.gov/About/developer_resources/allfaqs.xml';
$xml_file_path = '/tmp/allfaqs.xml';
//$xml_file_path = '/Users/philipashlock/Sites/test.dev/scraper/faq-data/allfaqs.xml';
$ch = curl_init($url);
// Return the response body from curl_exec() instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($ch);
curl_close($ch);
// NOTE(review): curl_exec() returns false on failure, which would cache an
// empty file here — there is no error handling in this fragment.
file_put_contents($xml_file_path, $data);
/// ------------------------------------------------------------------------------------------
$records = get_sources($xml_file_path);
// Stream-parse $xml_file_path with XMLReader, visiting "Row" elements; each
// row's outer XML (after the first) is loaded into simple_html_dom.
// NOTE(review): this fragment appears truncated — the if/while bodies are not
// fully closed, $config is built but never used in the visible portion, and
// the function's own closing brace is missing.
function get_sources($xml_file_path)
{
    // Specify configuration
    // (Tidy-style options — unused in the visible portion of this function.)
    $config = array('indent' => true, 'output-xhtml' => false, 'output-html' => true, 'show-warnings' => false, 'show-body-only' => true, 'wrap' => 200);
    $count = 1;
    $XMLReader = new XMLReader();
    $XMLReader->open($xml_file_path);
    // Move to the first "[item name]" node in the file.
    while ($XMLReader->read() && $XMLReader->name !== "Row") {
    }
    // Now that we're at the right depth, hop to the next "[item name]" until the end of tree/file.
    while ($XMLReader->name === "Row") {
        if ($count > 1) {
            // Parse the current <Row> subtree as HTML.
            $dom = new simple_html_dom();
            $dom->load($XMLReader->readOuterXML());
    } else {
        // NOTE(review): with $count initialized to 1, this returns false on
        // the first Row — looks suspicious, but kept as coded.
        return false;
    }
}
$run_environment = 'prod';
// either 'dev' or 'prod'
$max_records = 20;
// only used for testing
// In dev mode, surface all errors and pull in the local ScraperWiki shim.
// FIX: use strict string comparison (===) instead of loose == for the
// environment checks.
if ($run_environment === 'dev') {
    error_reporting(E_ALL);
    ini_set('display_errors', 'On');
    require 'scraperwiki.php';
}
require 'scraperwiki/simple_html_dom.php';
// NJ Department of Community Affairs mayors list (CSV source).
$url = 'http://www.state.nj.us/dca/home/20120809mayors.csv';
$output = get_sources($url);
// In dev mode, dump the scraped result as JSON for inspection.
if ($run_environment === 'dev') {
    header('Content-type: application/json');
    print json_encode($output);
}
// Fetch the CSV at $url via scraperWiki::scrape and iterate its rows.
// NOTE(review): this fragment is truncated — the foreach body, the row
// handling after str_getcsv(), and the function's closing brace are missing.
function get_sources($url)
{
    global $run_environment;
    global $max_records;
    $csv = scraperWiki::scrape($url);
    // Split the raw CSV into lines; assumes "\n" line endings — TODO confirm.
    $lines = explode("\n", $csv);
    $count = 1;
    // Each Row
    foreach ($lines as $line) {
        // Parse one CSV line into an array of fields.
        $line = str_getcsv($line);
        // Reset rep array
Example #5
0
     // Page dispatch: each case loads the data its view needs, sets the
     // template partial to include, renders via main.inc.php, and exits.
     // NOTE(review): fragment is truncated — the opening switch statement
     // (and the case label for this first group of statements) is outside
     // this chunk; each `break;` after `exit;` is unreachable.
     $page_content = "links_page.inc.php";
     include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
     exit;
     break;
 case 'rules':
     // Load offers/sources/countries/languages data used by the rules page JS.
     $arr_offers = get_rules_offers();
     list($js_last_offer_id, $js_offers_data) = get_offers_data_js($arr_offers);
     $js_sources_data = get_sources_data_js();
     $js_countries_data = get_countries_data_js();
     $js_langs_data = get_langs_data_js();
     $page_content = 'rules_page.inc.php';
     include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
     exit;
     break;
 case 'costs':
     // Cost reporting needs sources, campaigns and ads.
     $arr_sources = get_sources();
     $arr_campaigns = get_campaigns();
     $arr_ads = get_ads();
     $page_content = 'costs_page.inc.php';
     include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
     exit;
     break;
 case 'import':
     $page_content = 'import_page.inc.php';
     include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
     exit;
     break;
 case 'adnets':
     $page_content = 'adnets_page.inc.php';
     include _TRACK_SHOW_COMMON_PATH . "/templates/main.inc.php";
     exit;
    }
}
// NOTE(review): stray debug output left in — consider removing.
echo '7777777777';
if ($type == 1) {
    //twitter
    // Window of source records to process in this cron invocation.
    $start = isset($_GET['start']) ? $_GET['start'] : 0;
    $offset = isset($_GET['offset']) ? $_GET['offset'] : 100;
    // $cron_source = isset($_GET['cron_source']) ? $_GET['cron_source']: 'gae';
    // $time1 = time();
    $time1 = date("h:i:s");
    //set_time_limit(60);
    // Log the batch start time to a local file.
    $fp = fopen('twitter_1.txt', 'a+');
    fwrite($fp, 's(' . $start . '):' . $time1 . "\n");
    fclose($fp);
    //get_sources($cat_id, $connection, $start, $offset);
    get_sources('', $connection, $start, $offset);
    //  $time2 = time();
    $time2 = date("h:i:s");
    // Log the batch end time.
    $fp = fopen('twitter_1.txt', 'a+');
    fwrite($fp, 'e(' . $start . '):' . $time2 . "\n");
    fclose($fp);
    // cron_start_end($time1, $time2, $cron_source);
} else {
    if ($type == 2) {
        //rss
        $start = isset($_GET['start']) ? $_GET['start'] : 0;
        $offset = isset($_GET['offset']) ? $_GET['offset'] : 100;
        // $cron_source = 'rss_source';
        //  $time1 = time();
        $time1 = date("h:i:s");
        //set_time_limit(60);
        // NOTE(review): fragment is truncated here — the rest of the rss
        // branch and the closing braces are outside this chunk.
// faking this now, real location involves a more complicated POST request to:
// http://www.orcities.org/CityResources/LOCCityDirectory/CityReportViewer/tabid/6214/language/en-US/Default.aspx
// In prod, download the Oregon cities CSV and cache it at $csv_file_path.
// NOTE(review): $csv_file_path is never assigned in this chunk — presumably
// defined earlier in the full file; verify before running.
$url = 'http://dropbox.ashlock.us/opengov/democracymap/cities_or_data.csv';
if ($run_environment == 'prod') {
    $ch = curl_init($url);
    // Return the response body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_ENCODING, 'UTF-8');
    $data = curl_exec($ch);
    curl_close($ch);
    // solving for weird encoding issues
    // $data = iconv('utf-16le', 'utf-8', $data);
    file_put_contents($csv_file_path, $data);
    // Free the downloaded body once cached.
    unset($data);
}
/// ------------------------------------------------------------------------------------------
$output = get_sources($csv_file_path);
// In dev mode, emit the scraped result as JSON.
if ($run_environment == 'dev') {
    header('Content-type: application/json');
    print json_encode($output);
}
function get_sources($csv_file_path)
{
    global $run_environment;
    global $max_records;
    if (($handle = fopen($csv_file_path, 'r')) !== false) {
        $count = 1;
        $cityrow_count = 1;
        $city = array();
        // loop through the file line-by-line
        while (($data = fgetcsv($handle)) !== false) {
            // Name of the city