/**
 * Scrape a TED (Tenders Electronic Daily) RSS feed and save one record per item.
 *
 * @param string $url    RSS feed URL to fetch.
 * @param string $sector Sector label stored alongside each notice URL.
 * @return void
 */
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    // $xml = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20); // 20 seconds before aborting
    // try CURLOPT_CONNECTTIMEOUT (in seconds)
    // try CURLOPT_LOW_SPEED_LIMIT (to define what slow is, with):
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); (10 seconds at low speed before aborting)
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    // FIX: release the curl handle — it was leaked here, while the sibling
    // scrapeTEDDataPage() closes its handle after use.
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        // The GUID links to the TEXT view; the structured fields live on the DATA view.
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        // $record = scrapeTEDDataPage ($noticeURL, $sector);
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1); // throttle requests to the server
    }
    // simple_html_dom leaks memory without an explicit destruct.
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
/**
 * Scrape one results page of the DirectGov job search and store every vacancy row.
 *
 * @param int $page Page number appended to the search URL.
 * @return void
 */
function scrape_job_page($page)
{
    $page_html = scraperWiki::scrape("https://jobsearch.direct.gov.uk/JobSearch/PowerSearch.aspx?tm=0&pg=" . $page);
    $dom = new simple_html_dom();
    $dom->load($page_html);
    foreach ($dom->find("table tr") as $row) {
        $cells = $row->find("td");
        // Vacancy rows have exactly five cells; skip headers and layout rows.
        if (count($cells) != 5) {
            continue;
        }
        $named_anchor = $cells[0]->find('a[name]', 0);
        $job_id = intval($named_anchor->name);
        $details_url = $cells[2]->find('a', 0)->href;
        print $details_url;
        $record = array(
            'id' => $job_id,
            'posted_date' => date_create($cells[0]->plaintext),
            'job_title' => trim($cells[2]->plaintext),
            'company' => trim($cells[3]->plaintext),
            'location' => trim($cells[4]->plaintext),
            'url' => $details_url,
        );
        //print json_encode($record) . "\n";
        scraperwiki::save(array('id'), $record);
    }
    $dom->__destruct();
}
// NOTE(review): the statement and closing braces on the next line terminate a
// loop/conditional structure that begins outside this chunk — left untouched.
scraperwiki::save_sqlite(array("stock"), $record, "NSE_Stocks"); } } } $dom->__destruct(); }
//scrapping html
require 'scraperwiki/simple_html_dom.php';
// Walk the Kotak Securities P/E listing pages for every letter A-Z,
// up to 11 pages per letter, storing one row per stock.
foreach (range('A', 'Z') as $char) {
    $dom = new simple_html_dom();
    for ($pageNum = 0; $pageNum <= 10; $pageNum++) {
        $html = scraperWiki::scrape("http://www.kotaksecurities.com/stock-market-news/equity/1024/pe-ratio-NSE-All-" . $char . "/" . $pageNum);
        if ($html == NULL) {
            continue; // page failed to load; try the next page number
        }
        $dom->load($html);
        //print ("CHAR:".$char);
        foreach ($dom->find('table[class="TableBG1"]') as $table) {
            foreach ($table->find('tr[class="tabbody"]') as $tr) {
                // Columns in order: stock name, closing price, EPS, P/E ratio.
                $stock = $tr->children(0)->plaintext;
                $close = $tr->children(1)->plaintext;
                $eps = $tr->children(2)->plaintext;
                $pe = $tr->children(3)->plaintext;
                $record = array('stock' => $stock, 'close' => $close, 'eps' => $eps, 'pe' => $pe);
                //print_r($record);
                // 'stock' is the unique key for upserts into the NSE_Stocks table.
                scraperwiki::save_sqlite(array("stock"), $record, "NSE_Stocks");
            }
        }
    }
    // Free simple_html_dom memory before the next letter.
    $dom->__destruct();
}
/**
 * Scrape a single WikiCFP call-for-papers page and store one record.
 *
 * @param string $cat  Category the CFP was listed under.
 * @param string $name Event name, matched against the page's v:summary spans.
 * @param string $link Relative link to the CFP page on wikicfp.com.
 * @return bool False when the CFP was already known, true after saving.
 */
function extractCFP($cat, $name, $link)
{
    print " " . $name . " -- " . $link . "\n";
    if (alreadyKnown($cat, $name, $link)) {
        return false;
    }
    $html = scraperWiki::scrape("http://www.wikicfp.com/" . str_replace(" ", "%20", $link));
    $dom = new simple_html_dom();
    $dom->load($html);
    $spans = $dom->find("span");
    $type = "";
    $title = "";
    $link = "";
    $id = "";
    $description = "";
    $locality = "";
    $summaries = array();
    $startdates = array();
    $enddates = array();
    $sdate = "";
    $edate = "";
    $deadline = "";
    $notification = "";
    $finalversion = "";
    // WikiCFP marks its metadata up as RDFa-style spans with property/content attributes.
    foreach ($spans as $span) {
        // print_r($span);
        if (isset($span->attr['property'])) {
            // print(" ".$span->attr['property']."=".$span->attr['content']."\n");
            if (strcmp($span->attr['property'], "v:eventType") === 0) {
                $type = $span->attr['content'];
                print " type = " . $type . "\n";
            }
            if (strcmp($span->attr['property'], "dc:title") === 0) {
                $title = $span->attr['content'];
                print " title = " . $title . "\n";
            }
            if (strcmp($span->attr['property'], "dc:source") === 0) {
                $link = $span->attr['content'];
                print " link = " . $link . "\n";
            }
            if (strcmp($span->attr['property'], "dc:identifier") === 0) {
                $id = $span->attr['content'];
                print " id = " . $id . "\n";
            }
            if (strcmp($span->attr['property'], "dc:description") === 0) {
                $description = $span->attr['content'];
                print " description = " . $description . "\n";
            }
            if (strcmp($span->attr['property'], "v:locality") === 0) {
                $locality = $span->attr['content'];
                print " locality = " . $locality . "\n";
            }
            if (strcmp($span->attr['property'], "v:summary") === 0) {
                $summaries[] = $span->attr['content'];
            }
            if (strcmp($span->attr['property'], "v:startDate") === 0) {
                $startdates[] = $span->attr['content'];
            }
            if (strcmp($span->attr['property'], "v:endDate") === 0) {
                $enddates[] = $span->attr['content'];
            }
        }
    }
    // FIX: destruct the DOM once, after the loop. It was previously called on
    // every iteration, tearing the document down while still iterating its spans.
    $dom->__destruct();
    // Each v:summary span lines up by index with a v:startDate/v:endDate pair.
    foreach ($summaries as $ind => $summary) {
        if (strcmp($summary, $name) === 0) {
            $sdate = $startdates[$ind];
            $edate = $enddates[$ind];
            print " between " . $sdate . " and " . $edate . "\n";
        }
        if (strcmp($summary, "Submission Deadline") === 0) {
            $deadline = $startdates[$ind];
            print " deadline = " . $deadline . "\n";
        }
        if (strcmp($summary, "Notification Due") === 0) {
            $notification = $startdates[$ind];
            print " notification = " . $notification . "\n";
        }
        if (strcmp($summary, "Final Version Due") === 0) {
            $finalversion = $startdates[$ind];
            print " finalversion = " . $finalversion . "\n";
        }
    }
    $record = array('id' => $id, 'category' => $cat, 'type' => $type, 'title' => $title,
        'link' => $link, 'location' => $locality, 'description' => $description,
        'startdate' => $sdate, 'enddate' => $edate, 'deadline' => $deadline,
        'notification' => $notification, 'finalversion' => $finalversion);
    // FIX: the unique-key name must match the record key — the record stores
    // 'id' (lowercase), but the save call asked for 'ID'.
    scraperwiki::save(array('id', 'category'), $record);
    sleep(5); // be polite to wikicfp.com
    return true;
}
/**
 * Scrape the Florida League of Cities index page and return the city list.
 *
 * @param string $url Index page URL.
 * @return array List of arrays with 'source' (absolute URL) and 'name' keys.
 */
function get_city_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // table/tr/td/div/table/tr/td[2]/table/tr/td/table/tr[5]
    $content = $dom->find("div[id=ctl00_cphmain_pnlIndex]", 0)->find("table", 1);
    // FIX: initialise the result so a page with no links returns an empty
    // array instead of an undefined variable.
    $cities = array();
    $count = 0;
    foreach ($content->find("a") as $link) {
        if ($link->href) {
            $city['source'] = 'http://www.floridaleagueofcities.com' . $link->href;
            $city['name'] = $link->plaintext;
            $cities[] = $city;
            $count++;
        }
    }
    // Clear memory (simple_html_dom leaks without an explicit destruct)
    $dom->__destruct();
    $content->__destruct();
    return $cities;
}
/**
 * Scrape one city detail page: returns the city array in dev mode, or saves
 * it to the 'jurisdiction' table and returns true otherwise.
 *
 * @param string $url City detail page URL.
 * @return array|bool
 */
function get_city_data($url)
{
    global $run_environment;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $count = 1; // for debugging
    if (!$dom->find("table", 0)) {
        echo $url;
        exit;
    }
    // /html/body/table/tbody/tr/td/div/section/div/table
    // $content = $dom->find("table", 0)->find("tr", 0)->find("td", 0)->find("div", 0)->find("table", 0)->find("tr", 0)->find("td", 2)->find("table", 0);
    $content = $dom->find("table", 0)->find("tr", 0)->find("td", 0)->find("div", 0)->find("section", 0)->find("div", 0)->find("table", 0);
    $city['source'] = $url;
    $city['name_full'] = $content->find("h2", 0)->plaintext;
    // "City of Foo" / "Town of Foo": split on " of " to get name and type.
    $of_pos = strpos($city['name_full'], ' of ');
    $city['name'] = substr($city['name_full'], $of_pos + 4);
    $city['type'] = strtolower(substr($city['name_full'], 0, $of_pos));
    $website_anchor = $content->find("tr", 5)->find("td", 1)->find("a", 0);
    $city['url'] = $website_anchor ? $website_anchor->href : null;
    // Rows 6-17 each hold one labelled value in the second cell, fixed order.
    $row_fields = array(
        6 => 'region',
        7 => 'county',
        8 => 'address1',
        9 => 'address2',
        10 => 'phone',
        11 => 'fax',
        12 => 'council_meeting_time',
        13 => 'year_incorporated',
        14 => 'fiscal_year_start',
        15 => 'population',
        16 => 'government_type',
        17 => 'civil_service',
    );
    foreach ($row_fields as $row_index => $field) {
        $city[$field] = trim($content->find("tr", $row_index)->find("td", 1)->plaintext);
    }
    $rep_details = get_rep_details($content, $url, $city['name']);
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    if ($run_environment == 'dev') {
        $city['reps'] = $rep_details;
        return $city;
    } else {
        scraperwiki::save_sqlite(array('name_full', 'source'), $city, $table_name = 'jurisdiction');
        return true;
    }
}
// Part categories of interest on pcpartpicker; only "monitor" is scraped below.
$url[] = "cpu";
$url[] = "motherboard";
$url[] = "memory";
$url[] = "internal-hard-drive";
$url[] = "video-card";
$url[] = "power-supply";
$url[] = "case";
$url[] = "monitor";
$html = scraperWiki::scrape("http://pcpartpicker.com/parts/monitor/");
$dom = new simple_html_dom();
$dom->load($html);
unset($html);
foreach ($dom->find("id=\"list_table\" tr") as $row) {
    $cells = $row->find("td");
    $cell_links = $row->find("td a");
    // Rows without data cells are headers/separators.
    if (empty($cells[0])) {
        continue;
    }
    // Follow the part's detail link and scrape its specification table.
    $detail_html = scraperWiki::scrape("http://pcpartpicker.com" . $cell_links[0]->href);
    $detail_dom = new simple_html_dom();
    $detail_dom->load($detail_html);
    $spec_tables = $detail_dom->find("table class=\"box-table-a\"");
    $spec_record["href"] = $cell_links[0]->href;
    // Each spec row is a label/value pair; the label becomes the column name.
    foreach ($spec_tables[0]->find("tr") as $spec_row) {
        $spec_cells = $spec_row->find("td");
        $spec_record[$spec_cells[0]->plaintext] = $spec_cells[1]->plaintext;
    }
    scraperwiki::save(array('href'), $spec_record);
    //print json_encode($spec_record) . "\n";
    $detail_dom->__destruct();
}
$dom->__destruct();
/**
 * Scrape every NYC community-board table on one borough page.
 *
 * Parses each table[class=cb_table]: board title/number, neighborhoods,
 * precinct info, and the loosely structured contact block (address, email,
 * website, chair, ...). In dev mode the parsed boards are returned; otherwise
 * each one is saved to the 'community_board' table.
 *
 * @param string $name Borough name.
 * @param string $url  Borough page URL.
 * @return array|bool
 */
function get_cb_data($name, $url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $count = 1;
    // FIX: initialise so dev mode returns an empty array when no boards are found.
    $cbs = array();
    foreach ($dom->find("table[class=cb_table]") as $board) {
        $cb = null;
        $cb['source'] = $url;
        $cb['borough'] = $name;
        $cb['community_board'] = trim($board->find("td[class=cb_title]", 0)->plaintext);
        $cb['community_board_number'] = trim(substr($cb['community_board'], strlen('Community Board ')));
        $cb['city_id'] = get_city_id($cb['borough'], $cb['community_board_number']);
        $cb['neighborhoods'] = trim($board->find("tr", 1)->find("td", 2)->plaintext);
        $cb['precincts'] = trim($board->find("tr", 4)->find("td", 1)->plaintext);
        $cb['precinct_phones'] = trim($board->find("tr", 5)->find("td", 1)->plaintext);
        // Try to parse the unstructured contact info text
        $cb_info = trim($board->find("tr", 3)->find("td", 1)->innertext);
        $cb_info = str_replace('<strong>', '<b>', $cb_info);
        $cb_info = str_replace('</strong>', '</b>', $cb_info);
        $contacts = explode("<b>", $cb_info);
        foreach ($contacts as $val) {
            $val = str_replace("<br />", ",", $val);
            $val = trim($val);
            $val = explode("</b>", $val);
            // FIX: create_function() is deprecated since PHP 7.2 and removed
            // in PHP 8 — trim each part in place with a closure instead.
            array_walk($val, function (&$part) {
                $part = trim($part);
            });
            if (!empty($val[1])) {
                $heading = trim($val[0], ",");
                $heading = trim($heading, ":");
                $heading = strtolower(str_replace(' ', '_', $heading));
                // Clean up stray html tags
                // NOTE(review): stripos() truthiness skips a match at offset 0 — confirm intended.
                if (stripos($val[1], '<span>')) {
                    $val[1] = get_between($val[1], '<span>', '</span>');
                }
                // NOTE(review): trim()'s second argument is a character list, so
                // this strips any of '<', '/', 'p', '>' from both ends — not the
                // literal "</p>" tag. Preserved as-is.
                $val[1] = trim($val[1], '</p>');
                $val[1] = trim($val[1], ',');
                $val[1] = trim($val[1], ',');
                $cb[$heading] = $val[1];
            }
        }
        // Parse the comma-joined address text into title/street/city/state/zip.
        if (!empty($cb['address'])) {
            $cb['address'] = trim($cb['address']);
            $cb['address'] = trim($cb['address'], ',');
            $cb['address'] = str_replace(",,", ",", $cb['address']);
            $cb['address'] = trim($cb['address']);
            $lines = explode(',', $cb['address']);
            $line_num = count($lines) - 1;
            if ($line_num >= 4) {
                $cb['address_title'] = $lines[$line_num - 4];
            } else {
                $cb['address_title'] = $cb['borough'] . ' ' . $cb['community_board'];
            }
            if ($cb['address_title'] == $lines[$line_num - 3]) {
                $cb['address_1'] = $lines[$line_num - 2];
                $cb['address_2'] = null;
            } else {
                $cb['address_1'] = $lines[$line_num - 3];
                $cb['address_2'] = $lines[$line_num - 2];
            }
            $zip = trim($lines[$line_num], ', NY ');
            $cb['address_zip'] = $zip;
            $cb['address_city'] = $lines[$line_num - 1];
            $cb['address_state'] = 'NY';
        }
        // check if we have data in the email field that needs to be parsed like the website url
        if (!empty($cb['email'])) {
            $snippet = new simple_html_dom();
            $snippet->load($cb['email']);
            if ($snippet->find('a', 0)) {
                // Isolate the email address from the other html
                if (stripos($cb['email'], '<a') > 0) {
                    $cb['email'] = trim(substr($cb['email'], 0, stripos($cb['email'], '<a')));
                    if (count($emails = explode(',', $cb['email'])) > 1) {
                        $cb['all_email'] = $cb['email'];
                        $cb['email'] = trim($emails[0]);
                        $cb['email'] = trim($cb['email'], ' ');
                    }
                } else {
                    $cb['email'] = null;
                    $cb['website'] = null;
                }
                $cb['website'] = $snippet->find('a', 0)->href;
                // External URLs have a proxy URL on nyc.gov, let's parse that off
                if (stripos($cb['website'], 'exit.pl')) {
                    $cb['website'] = substr($cb['website'], stripos($cb['website'], 'exit.pl?') + 12);
                }
            } else {
                $cb['website'] = null;
            }
        } else {
            $cb['email'] = null;
        }
        // Make this field universal, even if we don't have any data
        if (empty($cb['all_email'])) {
            $cb['all_email'] = null;
        }
        // verify we didn't mix up website and email
        if (!empty($cb['website']) && stripos($cb['website'], 'mailto') !== FALSE) {
            $cb['email'] = substr($cb['website'], stripos($cb['website'], 'mailto:') + 7);
            $cb['website'] = null;
        }
        // Be sure to clear any stray commas
        if (!empty($cb['email'])) {
            $cb['email'] = trim($cb['email'], ',');
        }
        // normalize field names
        if (!empty($cb['chairperson'])) {
            $cb['chair'] = $cb['chairperson'];
            unset($cb['chairperson']);
        }
        if ($run_environment == 'dev') {
            $cbs[] = $cb;
        } else {
            scraperwiki::save_sqlite(array('source', 'borough', 'community_board_number'), $cb, $table_name = 'community_board');
        }
        $count++;
        //if ($run_environment == 'dev' && $count > $max_records) break;
        // Clear memory
        $board->__destruct();
    }
    // Clear memory
    $dom->__destruct();
    if ($run_environment == 'dev') {
        return $cbs;
    } else {
        return true;
    }
}
// NOTE(review): this chunk continues a loop over restaurant detail pages that
// opens outside the visible source; the leading assignment and the trailing
// closing braces belong to that structure — left untouched.
$email_rest = substr($datos_rest[1]->innertext, $posicion_corte); } else { $email_rest = ""; }
// The embedded map iframe src carries "ll=<lat>,<lon>&..." — cut out the pair.
$iframe = $dom_rest->find("div.TabbedPanelsContent iframe");
$posicion_corte = strpos($iframe[0]->src, "ll=") + 3;
$coords_rest = substr($iframe[0]->src, $posicion_corte);
$posicion_corte = strpos($coords_rest, "&");
$coords_rest = substr($coords_rest, 0, $posicion_corte);
$coords = explode(",", $coords_rest);
$lat_rest = $coords[0];
$lon_rest = $coords[1];
// External website link, when present.
$web_rest = $dom_rest->find("span.url a.external");
if ($web_rest) { $web_rest = $web_rest[0]->href; } else { $web_rest = ""; }
// Restaurant type is the first paragraph of the second column.
$type_rest = $dom_rest->find("div.col_02 p");
$type_rest = $type_rest[0]->innertext;
// Description follows a "Descripci..." heading; 54 is a hard-coded offset past
// that heading's markup — NOTE(review): fragile, verify against the live page.
$desc_rest = $dom_rest->find("div.col_02");
$posicion_corte = strpos($desc_rest[1]->innertext, "Descripci") + 54;
$desc_rest = strip_tags(substr($desc_rest[1]->innertext, $posicion_corte));
$desc_rest = $type_rest . " - " . $desc_rest;
// Collect one record per restaurant; saved in one batch after the loops.
$restaurante = array("nombre" => utf8_encode($nombre_rest), "direccion" => utf8_encode($direccion_rest), "telefono" => $telefono_rest, "descripcion" => utf8_encode($desc_rest), "lattitude" => $lat_rest, "longitude" => $lon_rest);
$restaurantes[] = $restaurante;
$dom_rest->__destruct(); } $dom->__destruct(); }
// 'nombre' is the unique key for the batch save.
scraperwiki::save(array('nombre'), $restaurantes);
/**
 * Convert Embedded CSS to Inline
 *
 * Extracts the first <style> block from the document, parses its rules, and
 * appends each declaration to the matching elements' inline style attribute.
 *
 * @param string $document    HTML document containing an embedded <style> block.
 * @param bool   $strip_class When true, strip class/id attributes from the output.
 * @return string|bool Inlined HTML; the unmodified document when no <style>
 *                     block exists; false when no styles could be parsed.
 */
function convert($document, $strip_class = false)
{
    // Debug mode will output selectors and styles that are detected in the embedded CSS
    $debug = false;
    // Extract the CSS
    preg_match('/<style[^>]+>(?<css>[^<]+)<\\/style>/s', $document, $matches);
    // If no CSS style
    if (empty($matches)) {
        return $document;
    }
    // Strip out extra newlines and tabs from CSS
    $css = preg_replace("/[\n\r\t]+/s", "", $matches['css']);
    // Extract each CSS declaration: index 1 is the selector list, index 2 the rule body
    preg_match_all('/([-a-zA-Z0-9_ ,#\\.]+){([^}]+)}/s', $css, $rules, PREG_SET_ORDER);
    // FIX: initialise so a style block with no parsable rules reaches the
    // final return instead of reading an undefined variable.
    $styles = array();
    foreach ($rules as $rule) {
        // A selector list may hold several comma-separated selectors; normalise
        // and process each one (a single selector is just a one-element list).
        $selector_list = explode(',', str_replace(', ', ',', $rule['1']));
        foreach ($selector_list as $selector) {
            $selector = trim($selector);
            if (!isset($styles[$selector])) {
                $styles[$selector] = '';
            }
            // FIX: when one selector appears in several rules, terminate the
            // previously collected declarations with ';' before appending, so
            // "color:red" + "font-size:1em" don't fuse into one declaration.
            if ($styles[$selector] !== '' && substr($styles[$selector], -1) !== ';') {
                $styles[$selector] .= ';';
            }
            $styles[$selector] .= trim($rule['2']);
            if ($debug) {
                echo $selector . ' { ' . trim($rule['2']) . ' }<br/>';
            }
        }
    }
    // DEBUG: Show selector and declaration
    if ($debug) {
        echo '<pre>';
        foreach ($styles as $selector => $styling) {
            echo $selector . ':<br>';
            echo $styling . '<br/><br/>';
        }
        echo '</pre><hr/>';
    }
    // For each style declaration, find the selector in the HTML and add the inline CSS
    if (!empty($styles)) {
        // Load Simple HTML DOM helper
        require_once 'simple_html_dom.php';
        $html_dom = new simple_html_dom();
        // Load in the HTML without the head and style definitions
        $html_dom->load(preg_replace('/\\<head\\>(.+?)\\<\\/head>/s', '', $document));
        foreach ($styles as $selector => $styling) {
            foreach ($html_dom->find($selector) as $element) {
                // Check to make sure the style doesn't already exist
                if (!stristr($element->style, $styling)) {
                    if (strlen($element->style) > 0 && substr(rtrim($element->style), -1) !== ';') {
                        $element->style .= ';';
                    }
                    // If there is any existing style, this will append to it
                    $element->style .= $styling;
                }
            }
        }
        $inline_css_message = $html_dom->save();
        // Strip class attribute
        if ($strip_class === true) {
            // FIX: character-class typo "[a-z0-0]" only matched 'a'-'z' and '0';
            // "[a-z0-9]" matches tags whose name contains any digit.
            $inline_css_message = preg_replace('~(<[a-z0-9][^>]*)(\\s(?:class|id)\\s*=\\s*(([\'"]).*?\\4|[^\\s]*))~usi', '\\1', $inline_css_message);
        }
        $html_dom->__destruct();
        return $inline_css_message;
    }
    return false;
}
/**
 * Scrape an index page and collect title/url pairs from its paragraph links.
 *
 * @param string $url Index page URL.
 * @return array List of records with 'title' and 'url' keys.
 */
function _scrapeIndexPage($url)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $records = array();
    foreach ($dom->find("div[@id='paddingLR12'] p") as $paragraph) {
        $anchors = $paragraph->find("a");
        // hrefs are site-relative with a leading "/"; drop it before
        // prefixing the base URL constant.
        $records[] = array(
            'title' => $anchors[0]->plaintext,
            'url' => URL_BASE . substr($anchors[0]->href, 1),
        );
    }
    $dom->__destruct();
    return $records;
}
/**
 * Fetch one TED notice "DATA" page and store its field table as a record.
 *
 * @param string $url    Notice DATA page URL; the bare TED portal URL is skipped.
 * @param string $sector Sector label saved with the record.
 * @return array The saved record (empty when the URL was the portal home page).
 */
function scrapeTEDDataPage($url, $sector)
{
    $record = array();
    // The feed sometimes yields the bare portal URL; nothing to scrape there.
    if (strcmp($url, "http://ted.europa.eu/") === 0) {
        return $record;
    }
    $time = microtime(true);
    // $html = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 10); // 10 seconds before aborting
    // try CURLOPT_CONNECTTIMEOUT (in seconds)
    // try CURLOPT_LOW_SPEED_LIMIT (to define what slow is, with):
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); (10 seconds at low speed before aborting)
    $html = curl_exec($curl);
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($html);
    print "......done in " . (microtime(true) - $time) . "s " . memory_get_usage() / 1000000 . "MB\n";
    $record = array('ID' => $url, 'sector' => $sector);
    // Each data-table row pairs a th label with its value in the second td.
    foreach ($dom->find("table[class=data] tr") as $row) {
        $labels = $row->find("th");
        $values = $row->find("td");
        $record[$labels[0]->plaintext] = $values[1]->plaintext;
        unset($labels);
        unset($values);
    }
    // Free simple_html_dom memory explicitly.
    $dom->__destruct();
    unset($dom);
    unset($html);
    scraperwiki::save(array('ID', 'sector'), $record);
    sleep(2); // throttle requests
    return $record;
}
// NOTE(review): this chunk continues a row loop that opens outside the visible
// source; the trailing closing braces belong to that structure — left untouched.
// Listing columns: registration number (set earlier), doctor, father, qualification, university.
$doc_name = trim($tr->children(1)->plaintext);
$father_name = trim($tr->children(2)->plaintext);
$quals_name = trim($tr->children(3)->plaintext);
$univ_name = trim($tr->children(4)->plaintext);
$link = $tr->find('a', 0);
if ($link != null) {
    // A detail link exists: fetch the doctor's page and pull the extra fields.
    $link_text = $link->href;
    $inner_html = scraperWiki::scrape("http://www.apmedicalcouncil.com/" . $link_text);
    $inner_dom->load($inner_html);
    $birth_date = trim($inner_dom->find('div[id="birth"]', 0)->plaintext);
    $sex = trim($inner_dom->find('div[id="sex"]', 0)->plaintext);
    $quals = trim($inner_dom->find('div[id="qualification"]', 0)->plaintext);
    $quals2 = trim($inner_dom->find('div[id="addqualification"]', 0)->plaintext);
    $add = trim($inner_dom->find('div[id="address"]', 0)->plaintext);
    $add_more = trim($inner_dom->find('div[id="address2"]', 0)->plaintext);
    $record = array('regd_num' => $regd_num, 'doc_name' => $doc_name, 'father_name' => $father_name, 'quals_name' => $quals_name, 'univ_name' => $univ_name, 'birth_date' => $birth_date, 'sex' => $sex, 'qualification' => $quals, 'advanced_quals' => $quals2, 'address' => $add, 'add_more' => $add_more);
} else {
    // No detail page: store only the listing columns.
    $record = array('regd_num' => $regd_num, 'doc_name' => $doc_name, 'father_name' => $father_name, 'quals_name' => $quals_name, 'univ_name' => $univ_name);
}
print_r($record);
$counter++;
//print_r($link);
//scraperwiki::save_sqlite(array("doc_name"),$record,"AP_Docs");
// Stop after two records — the save above is commented out, so this looks
// like a debugging run cut short on purpose.
if ($counter == 2) { break; } } } }
$inner_dom->__destruct();
$dom->__destruct();
/**
 * Scrape the NYC Council members table into a list of member records.
 *
 * @param string $url Members listing page URL.
 * @return array List of arrays with name, source, district, borough, party keys.
 */
function get_council_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // table/tr/td/div/table/tr/td[2]/table/tr/td/table/tr[5]
    $content = $dom->find("table[id=members_table]", 0);
    // FIX: initialise so an empty table returns an empty array instead of an
    // undefined variable.
    $council = array();
    $count = 1;
    foreach ($content->find("tr") as $row) {
        // Row 1 is the header; data starts on row 2.
        if ($count > 1) {
            $councilmember['name'] = $row->find("td", 0)->plaintext;
            $councilmember['source'] = 'http://council.nyc.gov' . $row->find("td", 0)->find("a", 0)->href;
            $councilmember['district'] = $row->find("td", 1)->plaintext;
            $councilmember['borough'] = $row->find("td", 2)->plaintext;
            $councilmember['party'] = $row->find("td", 3)->plaintext;
            $council[] = $councilmember;
        }
        $count++;
    }
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    return $council;
}
/**
 * Scrape an Open311 wiki endpoint table into a list of endpoint records.
 *
 * @param string $url Wiki page URL containing the endpoint table.
 * @return array List of arrays with jurisdiction, services, base_url, discovery keys.
 */
function get_link_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $content = $dom->find("table[class=wikitable]", 0);
    $count = 1;
    $endpoints = array();
    foreach ($content->find("tr") as $row) {
        // Skip the header row.
        if ($count == 1) {
            $count++;
            continue;
        }
        $jurisdiction_cell = $row->find("td", 0);
        $services_anchor = $row->find("td", 6)->find("a", 0);
        $endpoint = array();
        $endpoint['jurisdiction'] = $jurisdiction_cell ? trim($jurisdiction_cell->plaintext) : null;
        $endpoint['services'] = $services_anchor ? $services_anchor->href : null;
        // The base URL is everything before the trailing "services.xml".
        $endpoint['base_url'] = $endpoint['services'] ? substr($endpoint['services'], 0, strpos($endpoint['services'], 'services.xml')) : null;
        // Make sure we get the correct URL for discovery: prefer the link
        // whose text reads "xml" (the last one wins when several match).
        if ($row->find("td", 4)->find("a", 0)) {
            foreach ($row->find("td", 4)->find("a") as $link) {
                if (strtolower($link->plaintext) == 'xml') {
                    $endpoint['discovery'] = $link->href;
                }
            }
        }
        if (empty($endpoint['discovery'])) {
            $endpoint['discovery'] = null;
        }
        $endpoints[] = $endpoint;
        unset($endpoint);
        $count++;
    }
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    return $endpoints;
}
/**
 * Scrape a Europe Direct centre detail page and merge its fields into $result.
 *
 * Extracts the logo URL from the featured box's inline style, then splits the
 * address block (after the </h3> heading) on <br> into street / zip+city /
 * country followed by optional tel: / fax: / mailto lines, and finally
 * collects external links from the #euCenter area into 'seeAlso'.
 *
 * @param string $url    Detail page URL (fetched via the _getHTML helper).
 * @param array  $result Partially filled record to extend.
 * @return array The (possibly unchanged) $result — early returns occur when
 *               the page layout does not match expectations.
 */
function _handleDetailPage($url, $result)
{
    $html = _getHTML($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $divBoxes = $dom->find("div[@class='featured_box margin_top_fb']");
    if (count($divBoxes) < 1) {
        return $result; // layout mismatch: no featured box on this page
    }
    // The logo URL sits inside the box's inline style, between single quotes.
    $style = $divBoxes[0]->style;
    $styleParts = explode("'", $style);
    $imageURL = EUROPA_URL_BASE . $styleParts[1];
    $result['logoURL'] = $imageURL;
    // Address lines follow the </h3> heading, separated by <br> tags.
    $addressText = $divBoxes[0]->xmltext;
    $addressTextParts = explode('</h3>', $addressText);
    if (count($addressTextParts) !== 2) {
        return $result;
    }
    $addressText = $addressTextParts[1];
    $addressText = str_replace('<br />', '<br>', $addressText);
    $addressTextParts = explode('<br>', $addressText);
    if (count($addressTextParts) < 3) {
        return $result;
    }
    $street = trim($addressTextParts[0]);
    $matches = array();
    $curPos = 1;
    // If the first line holds no house number, the street continues on line 2;
    // $curPos then tracks where zip/city and country start.
    // NOTE(review): $street is built but never copied into $result — verify
    // whether a 'street' field was intended.
    preg_match('/^.*[0-9]+.*$/', $street, $matches);
    if (count($matches) === 0) {
        $street .= ' ' . trim($addressTextParts[$curPos++]);
    }
    $result['zipCity'] = trim($addressTextParts[$curPos++]);
    $result['country'] = trim($addressTextParts[$curPos++]);
    // Remaining lines: tel:, fax:, or an <a href="mailto:..."> contact link.
    for ($i = $curPos; $i < count($addressTextParts); ++$i) {
        $val = strtolower(trim($addressTextParts[$i]));
        if (substr($val, 0, 4) === 'tel:') {
            $result['tel'] = trim(substr($val, 4));
        } else {
            if (substr($val, 0, 4) === 'fax:') {
                $result['fax'] = trim(substr($val, 4));
            } else {
                if (substr($val, 0, 2) === '<a') {
                    // The href is the second quote-delimited token of the anchor tag.
                    $parts = explode('"', $val);
                    $result['mailto'] = trim($parts[1]);
                }
            }
        }
    }
    // Collect absolute links from the centre's content area.
    // NOTE(review): only 'http://' is matched — https links are dropped; confirm intended.
    $links = array();
    $aElements = $dom->find("div[@id='euCenter'] a");
    foreach ($aElements as $a) {
        if (strpos($a->href, 'http://') !== false) {
            $links[] = $a->href;
        }
    }
    if (count($links) > 0) {
        // Append to any seeAlso entries collected by earlier processing.
        if (isset($result['seeAlso'])) {
            $result['seeAlso'] = array_merge($result['seeAlso'], $links);
        } else {
            $result['seeAlso'] = $links;
        }
    }
    $dom->__destruct();
    return $result;
}