/**
 * Scrape one EURES search-result page and store every job that is not yet
 * present in the `job` table.
 *
 * The page at $url_search is fetched with scraperwiki::scrape() and parsed
 * with simple_html_dom. Each `table.JResult` is treated as one job: the job
 * URL is taken from the title link's onclick attribute, and the description
 * and source are taken from the cells following the matching <th> labels.
 * Progress is echoed as HTML.
 *
 * @param string $url_search URL of the search-result page to scrape; also
 *                           stored with each inserted job.
 * @param mixed  $country_id Value stored in job.country_id.
 *
 * @return void
 */
function scraper($url_search, $country_id)
{
    $has_next = false;
    $url_next = null;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";

    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find('table[class=JResult]') as $result) {
        // Reset per-result state so a result that lacks a field never
        // inherits the value scraped from the previous result.
        $url_job = null;
        $url_id = '';
        $description = '';
        $source_id = '';

        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The job path is the first single-quoted argument of the onclick handler.
            $chars = explode("'", $job_page->onclick);
            if (!isset($chars[1])) {
                continue;
            }
            // Drop the first character of the path before appending it to the base URL.
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }

        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text != 'Description:' && $text != 'Source:') {
                continue;
            }

            $cell = $data->next_sibling();
            if (!$cell) {
                // Label without a value cell: nothing to read.
                continue;
            }

            if ($text == 'Description:') {
                $description = trim($cell->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($cell->plaintext);
                // insert_name() is defined elsewhere; keep handing it the
                // quote-escaped value it has always received.
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != ' ') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }

        if ($url_job === null) {
            // No usable job link in this result: there is nothing to key the row on.
            continue;
        }

        $description = str_replace("</BR>", "", $description);

        // Escape every value interpolated into SQL below; scraped text and
        // URLs may contain quotes or backslashes.
        $sql_url_job = mysql_real_escape_string($url_job);
        $sql_url_id = mysql_real_escape_string($url_id);
        $sql_description = mysql_real_escape_string($description);
        $sql_source_id = mysql_real_escape_string($source_id);
        $sql_url_search = mysql_real_escape_string($url_search);
        $sql_country_id = mysql_real_escape_string($country_id);

        $sql = mysql_query("SELECT * FROM job WHERE url = '{$sql_url_job}'");
        $cont = mysql_num_rows($sql);
        if ($cont == 0) {
            mysql_query("INSERT INTO job SET
                    url = '{$sql_url_job}',
                    url_id = '{$sql_url_id}',
                    description = '{$sql_description}',
                    source_id = '{$sql_source_id}',
                    url_search = '{$sql_url_search}',
                    country_id='{$sql_country_id}',
                    url_scraper_date = SYSDATE(),
                    url_scraper_hour = SYSDATE()");
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }

    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }

    //Comment this for tests, uncomment this to get all data
    // if ($has_next == true){
    //     sleep(1);
    //     scraper($url_next, $country_id);
    // }
}
case 'Phone:': $phone = $value; break; case 'Email:': $email = $value; break; case 'Fax:': $fax = $value; $fax_array = format_fax($fax); break; //Application //Application case 'How to apply:': if (strlen($value) > 0) { $how_to_apply = $value; $how_to_apply_id = insert_name('how_to_apply', $how_to_apply); } break; case 'Contact:': $contact = $value; break; case 'Last date for application:': $last_date_for_application = $value; break; //Other Information //Other Information case 'Date published:': $date_published = $value; break; case 'National reference:': $national_reference = $value;
/**
 * Import a base timetable ("basisrooster") from a UDMZ file into the database.
 *
 * Steps, in order: remove rows left over from an earlier import of the same
 * file, add students with their base class and category, add teachers, add
 * the remaining groups with their members, derive the grp2grp pairs, add the
 * lessons, and finally set files.file_status to 1.
 *
 * When a helper reports failure the function returns early, so file_status
 * is left unchanged. The one exception is a teacher row that fails checkset(),
 * which is skipped instead.
 *
 * @param mixed  $file_id  Id of the file; interpolated into SQL as file_id /
 *                         file_id_basis (presumably an integer; verify against caller).
 * @param string $tmp_name Passed to read_udmz_file() to load the UDMZ data.
 *
 * @return void
 */
function import_basisrooster($file_id, $tmp_name)
{
    global $grp2ppl, $stamz;

    lock_renew_helper(1);
    $udmz = read_udmz_file($tmp_name);

    // a previous update may have gone wrong, cleanup just in case
    mdb2_exec("DELETE FROM files2lessen WHERE file_id = {$file_id}");
    mdb2_exec("DELETE FROM grp2ppl WHERE file_id_basis = {$file_id}");
    mdb2_exec("DELETE FROM grp2grp WHERE file_id_basis = {$file_id}");

    // first we go through all students
    // then all teachers
    // then all groups
    // and finally: all lessons
    if (!checkset($udmz, 'udmz file', array('Groep', 'Leerling', 'Docent', 'Les'))) {
        return;
    }

    // Look the ignore lists up once instead of on every loop iteration.
    // in_array() stays non-strict on purpose: numeric-looking array keys are
    // integers in PHP, while the configured values may be strings.
    $ignored_categories = config('ZERMELO_CATEGORY_IGNORE');
    $ignored_groups = config('ZERMELO_GROUP_IGNORE');

    // Count the units of work up front so progress can be reported as a fraction.
    $leerlingen = 0;
    $categorieen_leerling = 0;
    foreach ($udmz['Leerling'] as $category => $value) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        $categorieen_leerling++;
        $leerlingen += count($value);
    }

    $docenten = count($udmz['Docent']);

    $groepen = 0;
    $categorieen_groep = 0;
    foreach ($udmz['Groep'] as $category => $value) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        $categorieen_groep++;
        foreach ($value as $id => $row) {
            if (in_array($id, $ignored_groups)) {
                continue;
            }
            $groepen++;
        }
    }

    $lessen = count($udmz['Les']);

    /*
    logit('Categorieen leerling: '.$categorieen_leerling);
    logit('Leerlingen: '.$leerlingen);
    logit('Docenten: '.$docenten);
    logit('Categorieen groep: '.$categorieen_groep);
    logit('Groepen: '.$groepen);
    logit('Lessen: '.$lessen);
    */

    $total = $categorieen_leerling + $leerlingen + $categorieen_groep + $groepen + $docenten + $lessen;
    $done = 0;

    lock_renew_helper(2, import_basisrooster_progress($done, $total));

    // Students: one entity per category, per student and per base class.
    foreach ($udmz['Leerling'] as $category => $list) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        incdone($done, $total, 2);
        if (!($category_id = add_entity($category, CATEGORIE))) {
            return;
        }
        foreach ($list as $id => $row) {
            incdone($done, $total, 2);
            if (!checkset($row, "Leerling.{$category}", array('LASTNAME', 'FIRSTNAME', 'BETWEENNAME', 'BASICCLASS'))) {
                return;
            }
            if (!($leerling_id = add_entity($id, LEERLING))) {
                return;
            }
            insert_name($leerling_id, $row['FIRSTNAME'], $row['BETWEENNAME'], $row['LASTNAME']);
            if (!($lesgroep_id = add_entity($row['BASICCLASS'], STAMKLAS))) {
                return;
            }
            // Remember which category each base class belongs to; the group
            // pass below uses this to recognise base classes it has already seen.
            $stamz[$row['BASICCLASS']] = $category;
            add_basis_grp2ppl($lesgroep_id, $leerling_id, $file_id);
            add_basis_grp2ppl($category_id, $leerling_id, $file_id);
        }
    }

    lock_renew_helper(2, import_basisrooster_progress($done, $total));

    // Teachers: a row with missing fields is skipped rather than aborting the import.
    foreach ($udmz['Docent'] as $id => $row) {
        incdone($done, $total, 2);
        if (!checkset($row, 'Docent', array('Voornaam', 'Tussenvoegsel', 'Achternaam', 'e-mail'))) {
            continue;
        }
        if (!($docent_id = add_entity($id, DOCENT))) {
            return;
        }
        if ($row['Achternaam'] != '' && $row['Voornaam'] != '') {
            // Only the first character of the first name is stored, followed by a dot.
            insert_name($docent_id, substr($row['Voornaam'], 0, 1) . '.', $row['Tussenvoegsel'], $row['Achternaam'], $row['e-mail']);
        }
    }

    lock_renew_helper(2, import_basisrooster_progress($done, $total));

    // Groups: every group that is not an already-imported base class.
    foreach ($udmz['Groep'] as $category => $list) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        incdone($done, $total, 2);
        if (!($category_id = add_entity($category, CATEGORIE))) {
            return;
        }
        foreach ($list as $id => $row) {
            if (in_array($id, $ignored_groups)) {
                continue;
            }
            incdone($done, $total, 2);
            if (isset($stamz[$id])) {
                if ($stamz[$id] != $category) {
                    // Same name as a base class but in another category:
                    // log it and carry on importing it as a regular group.
                    logit('stamklas in andere categorie?!?!?');
                } else {
                    // do nothing, because we already have the base classes
                    continue;
                }
            }
            if (!checkset($row, "Groep.{$category}", array('SET'))) {
                return;
            }
            if (!($lesgroep_id = add_entity(config('IGNORE_BEFORE_DOT') ? $id : $category . '.' . $id, LESGROEP))) {
                return;
            }
            if ($row['SET'] == '') {
                // no students in this group
                continue;
            }
            // SET is a comma-separated list of student numbers.
            foreach (explode(',', $row['SET']) as $leerlingnummer) {
                if (!($leerling_id = add_entity($leerlingnummer, LEERLING))) {
                    return;
                }
                add_basis_grp2ppl($lesgroep_id, $leerling_id, $file_id);
            }
        }
    }

    lock_renew_helper(2, import_basisrooster_progress($done, $total));

    // Record every pair of groups that share at least one person within this file.
    mdb2_exec(<<<EOT
INSERT INTO grp2grp ( lesgroep_id, lesgroep2_id, file_id_basis )
SELECT DISTINCT grp2ppl.lesgroep_id, grp2ppl2.lesgroep_id, grp2ppl.file_id_basis
FROM grp2ppl
JOIN grp2ppl AS grp2ppl2 ON grp2ppl.ppl_id = grp2ppl2.ppl_id AND grp2ppl.file_id_basis = grp2ppl2.file_id_basis
WHERE grp2ppl.file_id_basis = {$file_id}
EOT
    );

    lock_renew_helper(2, import_basisrooster_progress($done, $total));

    // Lessons.
    foreach ($udmz['Les'] as $id => $row) {
        incdone($done, $total, 2);
        if (!checkset($row, 'Les', array('#WijzigComment', 'Dag', 'Uur', 'Vak', 'Grp', 'Doc', 'Lok'))) {
            return;
        }
        if (!($zermelo_id = add_zermelo_id($id))) {
            return;
        }
        insert_les(',', $zermelo_id, $row['Dag'], $row['Uur'], $row['Vak'], $row['Grp'], $row['Doc'], $row['Lok'], $file_id, $row['#WijzigComment']);
    }

    // if we get here, everything went well
    mdb2_exec("UPDATE files SET file_status = 1 WHERE file_id = {$file_id}");
    lock_renew_helper(3, import_basisrooster_progress($done, $total));
}

/**
 * Fraction of the import that has been completed.
 *
 * A file with no work items ($total of 0) is reported as complete (1)
 * instead of dividing by zero.
 *
 * @param int $done  Number of work items processed so far.
 * @param int $total Total number of work items counted up front.
 *
 * @return int|float $done / $total, or 1 when $total is not positive.
 */
function import_basisrooster_progress($done, $total)
{
    return $total > 0 ? $done / $total : 1;
}
//FIXME. HACK by Lucas, to get the data from Contact, since the TH for this data in EURES pages is not well-written, causing the extractor not to work. e.g.<th colspan="1>Contact:</th> default: $contact = trim(str_replace("</td>", "", $text)); $contact = str_replace("'", "\\'", $contact); break; } } } ## CLEANING SOME DATA ## if (isset($salary['currency']) && !isset($salary_currency)) { $salary_currency = $salary['currency']; $salary_currency_id = insert_name('salary_currency', $salary_currency); } if (isset($salary['currency']) && !isset($salary_period)) { $salary_period = $salary['period']; $salary_period_id = insert_name('salary_period', $salary_period); } if (preg_match('/www./', $address)) { $explode_address = explode(",", $address); $i = 0; while ($i < sizeof($explode_address)) { if (preg_match('/www./', $explode_address[$i])) { $homepage = $explode_address[$i]; } $i++; } } if (preg_match('/www./', $information)) { $explode_information = explode(" ", $information); $i = 0; while ($i < sizeof($explode_information)) {