/**
 * Recursively scrapes one EURES job-search result page for a country.
 *
 * For every job result on the page, the job URL is extracted from the title
 * link's onclick handler; new jobs (no snapshot file yet) are appended to a
 * per-country, per-day CSV and saved as an HTML snapshot with the scraped
 * metadata embedded as <div> elements. If a "Next page" link exists, the
 * function sleeps one second and recurses into it.
 *
 * @param string $url_search full URL of the search-result page to scrape
 * @param string $country_id country code used in output file paths
 */
function scraper($url_search, $country_id)
{
    $scraper_date = date("Y-m-d");
    $scraper_hour = date("H:i:s");
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $has_next = false;
    // Length of the current page URL; used below to stop recursing before
    // the GET URL grows past what the server appears to accept.
    $url_size = strlen($url_search);

    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Each job result is rendered as a <table class="JResult">.
    foreach ($dom->find('table[class=JResult]') as $result) {
        // Reset per-result state. In the original code these carried over
        // between iterations, so a result with no title link reused (and
        // overwrote the snapshot of) the previous job's identifiers.
        $url_job = null;
        $url_id = null;
        $url_job_unique = null;
        $url_job_unique_slashless = null;
        $description = '';
        $source = '';

        // The job URL is embedded in the title link's onclick attribute,
        // single-quoted; explode on "'" and take the second chunk.
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            $chars = explode("'", $job_page->onclick);
            $url_job_unique = substr($chars[1], 1);
            $url_job = $base_url . $url_job_unique;
            $url_id = str_replace('uniqueJvId=', "", strstr($url_job, 'uniqueJvId='));
            $url_job_unique = str_replace('/ShowJvServlet?lg=EN&serviceUri=', "", $url_job_unique);
            // '*' replaces '/' so the unique id can be used as a file name.
            $url_job_unique_slashless = str_replace('/', "*", $url_job_unique);
            echo "JOB: " . $url_job . "<br />";
        }

        if ($url_job_unique_slashless === null) {
            // No title link in this result table: skip it instead of
            // operating on stale data from the previous iteration.
            continue;
        }

        $file = 'jobs/' . $country_id . '/' . $url_job_unique_slashless . '.html';
        if (!file_exists($file)) {
            // Description and source appear as <th> label cells whose value
            // is in the next sibling cell.
            foreach ($result->find('th') as $data) {
                $text = trim($data->plaintext);
                if ($text == 'Description:') {
                    $description = trim($data->next_sibling()->plaintext);
                    echo "DESCRIPTION: " . $description . "<br />";
                }
                if ($text == 'Source:') {
                    $source = trim($data->next_sibling()->plaintext);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }

            // Gets the HTML of the job's own page.
            $html_job = scraperwiki::scrape($url_job);

            // Appends the scraped row to the per-country, per-day CSV.
            $fp = fopen('jobs/' . $country_id . '_' . $scraper_date . '.csv', 'a+');
            fputcsv($fp, array($url_job, $url_id, $url_job_unique,
                $url_job_unique_slashless, $description, $source, $url_search,
                $country_id, $scraper_date, $scraper_hour));
            fclose($fp);

            // Saves the job HTML snapshot with the metadata embedded as divs
            // (presumably re-parsed later by id; keep ids stable).
            $fh = fopen($file, 'w');
            fwrite($fh, '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />');
            fwrite($fh, $html_job);
            fwrite($fh, '<div id=url_job>' . $url_job . '</div>');
            fwrite($fh, '<div id=country_id>' . $country_id . '</div>');
            fwrite($fh, '<div id=description>' . $description . '</div>');
            fwrite($fh, '<div id=source>' . $source . '</div>');
            fwrite($fh, '<div id=url_id>' . $url_id . '</div>');
            fwrite($fh, '<div id=url_job_unique>' . $url_job_unique . '</div>');
            fwrite($fh, '<div id=url_job_unique_slashless>' . $url_job_unique_slashless . '</div>');
            fwrite($fh, '<div id=url_search>' . $url_search . '</div>');
            fwrite($fh, '<div id=scraper_date>' . $scraper_date . '</div>');
            fwrite($fh, '<div id=scraper_hour>' . $scraper_hour . '</div>');
            fclose($fh);
        } else {
            echo "Job already extracted.";
        }
    }

    // Looks for the "Next page" link of the result pagination.
    $url_next = null;
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = $base_url . substr($next_page->href, 1);
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }

    // Calls the next search page to scrap. The 4101 cap stops the crawl
    // once the URL grows too long for the server (empirical limit).
    if ($has_next == true) {
        if ($url_size <= 4101) {
            sleep(1); // waits before the next extraction
            scraper($url_next, $country_id);
        } else {
            echo "Page URL size is to big.";
        }
    } else {
        echo "No more pages to scrap.";
    }
}
// NOTE(review): the original file had an orphaned duplicate of the
// "next page" loop here — a stray "}" followed by code referencing $dom
// outside any function and a second unmatched "}". That fragment was both
// dead code and a fatal parse error, so it has been removed; the live
// top-level script below is unchanged in behavior.

// EURES country codes to crawl.
// NOTE(review): 'IR' looks wrong — Ireland's ISO/EURES code is 'IE';
// confirm against the EURES country list before changing.
$country = array('AT', 'BG', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'DE', 'GR',
    'HU', 'IS', 'IR', 'IT', 'LV', 'LI', 'LT', 'LU', 'MT', 'NL', 'NO', 'PL',
    'PT', 'RO', 'SK', 'SI', 'ES', 'SE', 'CH', 'UK', 'BE');
//$country = array('AT');

// Results per search page requested from EURES.
$page_size = 99;

// NOTE(review): the mysql_* extension is deprecated (removed in PHP 7);
// migrate to mysqli/PDO with prepared statements when the runtime allows.
$sql_update = mysql_query("SELECT * FROM update_service");
$count_update = mysql_num_rows($sql_update); // NOTE(review): never read below — confirm before deleting.

// Fixed "published since" date far in the past so every listing matches.
$day = "01";
$month = "01";
$year = "1975";

// Crawl each country starting from page 1 of its browse results; scraper()
// follows the "Next page" links recursively from there.
for ($i = 0; $i < sizeof($country); $i++) {
    $url_first = "http://ec.europa.eu/eures/eures-searchengine/servlet/BrowseCountryJVsServlet?lg=EN&isco=&country=" . $country[$i] . "&multipleRegions=%25&date=" . $day . "%2F" . $month . "%2F" . $year . "&title=&durex=&exp=&qual=&pageSize=" . $page_size . "&totalCount=999999999&startIndexes=0-1o1-1o2-1I0-2o1-30o2-1I0-3o1-59o2-1I0-4o1-88o2-1I&page=1";
    scraper($url_first, $country[$i]);
}

// Records in the database that a crawl of type 'url' finished now.
mysql_query("INSERT INTO update_service SET date = SYSDATE(), hour = SYSDATE(),type='url'");