/**
 * Recursively scrapes one EURES job-search result page for a country.
 *
 * For every job result on the page, the job URL is extracted from the title
 * link's onclick handler; new jobs (no snapshot file yet) are appended to a
 * per-country, per-day CSV and saved as an HTML snapshot with the scraped
 * metadata embedded as <div> elements. If a "Next page" link exists, the
 * function sleeps one second and recurses into it.
 *
 * @param string $url_search full URL of the search-result page to scrape
 * @param string $country_id country code used in output file paths
 */
function scraper($url_search, $country_id)
{
    $scraper_date = date("Y-m-d");
    $scraper_hour = date("H:i:s");
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $has_next = false;
    // Length of the current page URL; used below to stop recursing before
    // the GET URL grows past what the server appears to accept.
    $url_size = strlen($url_search);

    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Each job result is rendered as a <table class="JResult">.
    foreach ($dom->find('table[class=JResult]') as $result) {
        // Reset per-result state. In the original code these carried over
        // between iterations, so a result with no title link reused (and
        // overwrote the snapshot of) the previous job's identifiers.
        $url_job = null;
        $url_id = null;
        $url_job_unique = null;
        $url_job_unique_slashless = null;
        $description = '';
        $source = '';

        // The job URL is embedded in the title link's onclick attribute,
        // single-quoted; explode on "'" and take the second chunk.
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            $chars = explode("'", $job_page->onclick);
            $url_job_unique = substr($chars[1], 1);
            $url_job = $base_url . $url_job_unique;
            $url_id = str_replace('uniqueJvId=', "", strstr($url_job, 'uniqueJvId='));
            $url_job_unique = str_replace('/ShowJvServlet?lg=EN&serviceUri=', "", $url_job_unique);
            // '*' replaces '/' so the unique id can be used as a file name.
            $url_job_unique_slashless = str_replace('/', "*", $url_job_unique);
            echo "JOB: " . $url_job . "<br />";
        }

        if ($url_job_unique_slashless === null) {
            // No title link in this result table: skip it instead of
            // operating on stale data from the previous iteration.
            continue;
        }

        $file = 'jobs/' . $country_id . '/' . $url_job_unique_slashless . '.html';
        if (!file_exists($file)) {
            // Description and source appear as <th> label cells whose value
            // is in the next sibling cell.
            foreach ($result->find('th') as $data) {
                $text = trim($data->plaintext);
                if ($text == 'Description:') {
                    $description = trim($data->next_sibling()->plaintext);
                    echo "DESCRIPTION: " . $description . "<br />";
                }
                if ($text == 'Source:') {
                    $source = trim($data->next_sibling()->plaintext);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }

            // Gets the HTML of the job's own page.
            $html_job = scraperwiki::scrape($url_job);

            // Appends the scraped row to the per-country, per-day CSV.
            $fp = fopen('jobs/' . $country_id . '_' . $scraper_date . '.csv', 'a+');
            fputcsv($fp, array($url_job, $url_id, $url_job_unique,
                $url_job_unique_slashless, $description, $source, $url_search,
                $country_id, $scraper_date, $scraper_hour));
            fclose($fp);

            // Saves the job HTML snapshot with the metadata embedded as divs
            // (presumably re-parsed later by id; keep ids stable).
            $fh = fopen($file, 'w');
            fwrite($fh, '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />');
            fwrite($fh, $html_job);
            fwrite($fh, '<div id=url_job>' . $url_job . '</div>');
            fwrite($fh, '<div id=country_id>' . $country_id . '</div>');
            fwrite($fh, '<div id=description>' . $description . '</div>');
            fwrite($fh, '<div id=source>' . $source . '</div>');
            fwrite($fh, '<div id=url_id>' . $url_id . '</div>');
            fwrite($fh, '<div id=url_job_unique>' . $url_job_unique . '</div>');
            fwrite($fh, '<div id=url_job_unique_slashless>' . $url_job_unique_slashless . '</div>');
            fwrite($fh, '<div id=url_search>' . $url_search . '</div>');
            fwrite($fh, '<div id=scraper_date>' . $scraper_date . '</div>');
            fwrite($fh, '<div id=scraper_hour>' . $scraper_hour . '</div>');
            fclose($fh);
        } else {
            echo "Job already extracted.";
        }
    }

    // Looks for the "Next page" link of the result pagination.
    $url_next = null;
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = $base_url . substr($next_page->href, 1);
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }

    // Calls the next search page to scrap. The 4101 cap stops the crawl
    // once the URL grows too long for the server (empirical limit).
    if ($has_next == true) {
        if ($url_size <= 4101) {
            sleep(1); // waits before the next extraction
            scraper($url_next, $country_id);
        } else {
            echo "Page URL size is to big.";
        }
    } else {
        echo "No more pages to scrap.";
    }
}
// NOTE(review): the original file had an orphaned duplicate of the
// "next page" loop here — a stray "}" followed by code referencing $dom
// outside any function and a second unmatched "}". That fragment was both
// dead code and a fatal parse error, so it has been removed; the live
// top-level script below is unchanged in behavior.

// EURES country codes to crawl.
// NOTE(review): 'IR' looks wrong — Ireland's ISO/EURES code is 'IE';
// confirm against the EURES country list before changing.
$country = array('AT', 'BG', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'DE', 'GR',
    'HU', 'IS', 'IR', 'IT', 'LV', 'LI', 'LT', 'LU', 'MT', 'NL', 'NO', 'PL',
    'PT', 'RO', 'SK', 'SI', 'ES', 'SE', 'CH', 'UK', 'BE');
//$country = array('AT');

// Results per search page requested from EURES.
$page_size = 99;

// NOTE(review): the mysql_* extension is deprecated (removed in PHP 7);
// migrate to mysqli/PDO with prepared statements when the runtime allows.
$sql_update = mysql_query("SELECT * FROM update_service");
$count_update = mysql_num_rows($sql_update); // NOTE(review): never read below — confirm before deleting.

// Fixed "published since" date far in the past so every listing matches.
$day = "01";
$month = "01";
$year = "1975";

// Crawl each country starting from page 1 of its browse results; scraper()
// follows the "Next page" links recursively from there.
for ($i = 0; $i < sizeof($country); $i++) {
    $url_first = "http://ec.europa.eu/eures/eures-searchengine/servlet/BrowseCountryJVsServlet?lg=EN&isco=&country=" . $country[$i] . "&multipleRegions=%25&date=" . $day . "%2F" . $month . "%2F" . $year . "&title=&durex=&exp=&qual=&pageSize=" . $page_size . "&totalCount=999999999&startIndexes=0-1o1-1o2-1I0-2o1-30o2-1I0-3o1-59o2-1I0-4o1-88o2-1I&page=1";
    scraper($url_first, $country[$i]);
}

// Records in the database that a crawl of type 'url' finished now.
mysql_query("INSERT INTO update_service SET date = SYSDATE(), hour = SYSDATE(),type='url'");