Ejemplo n.º 1
0
/**
 * Scrape one EURES search-result page and store every job that is not yet in the `job` table.
 *
 * For each `table.JResult` on the page the job URL (built from the title link's onclick
 * handler), its `uniqueJvId`, the description and the source are extracted. The job is
 * inserted only when no row with the same URL exists. Progress is echoed as HTML.
 *
 * Results without a usable job link are skipped, and every per-result value is reset at the
 * start of each result so nothing leaks from the previous job into the next one.
 *
 * NOTE(review): this still relies on the legacy mysql_* API (removed in PHP 7) because the
 * connection handling lives outside this function; values are escaped with
 * mysql_real_escape_string() instead of being interpolated raw.
 *
 * @param string $url_search URL of the search-result page to scrape; stored in job.url_search.
 * @param mixed  $country_id Value stored in job.country_id for every job found on the page.
 * @return void
 */
function scraper($url_search, $country_id)
{
    $has_next = false;
    $url_next = null; // defined up front so the optional recursion below never reads an unset variable
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('table[class=JResult]') as $result) {
        // Reset per-result state: a result lacking a field must not inherit the previous job's value.
        $url_job = '';
        $url_id = '';
        $description = '';
        $source_id = '';
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The relative job URL is the first single-quoted argument of the onclick handler.
            $chars = explode("'", $job_page->onclick);
            if (!isset($chars[1])) {
                continue;
            }
            // Drop the first character of the quoted value before appending it to the base URL.
            $url_job = $base_url . substr($chars[1], 1);
            $id_part = strstr($url_job, 'uniqueJvId=');
            $url_id = $id_part === false ? '' : str_replace('uniqueJvId=', "", $id_part);
            echo "JOB: " . $url_job . "<br />";
        }
        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $cell = $data->next_sibling();
                if ($cell) {
                    $description = trim($cell->plaintext);
                    echo "DESCRIPTION: " . $description . "<br />";
                }
            } elseif ($text == 'Source:') {
                $cell = $data->next_sibling();
                if ($cell) {
                    $source = trim($cell->plaintext);
                    // insert_name() receives the value with single quotes pre-escaped, as before.
                    $source = str_replace("'", "\\'", $source);
                    if ($source != '' && $source != '&nbsp;') {
                        $source_id = insert_name('source', $source);
                        echo "SOURCE: " . $source . "<br /><br />";
                    }
                }
            }
        }
        if ($url_job === '') {
            // No job link in this result table, so there is nothing to key the row on.
            continue;
        }
        $description = str_replace("</BR>", "", $description);
        // Escape right before use so the same connection serves escaping and querying.
        $url_job_sql = mysql_real_escape_string($url_job);
        $url_id_sql = mysql_real_escape_string($url_id);
        $description_sql = mysql_real_escape_string($description);
        $source_id_sql = mysql_real_escape_string($source_id);
        $url_search_sql = mysql_real_escape_string($url_search);
        $country_id_sql = mysql_real_escape_string($country_id);
        $sql = mysql_query("SELECT * FROM job WHERE url = '{$url_job_sql}'");
        if ($sql === false) {
            // Without a result set we cannot tell whether the job exists; do not risk a duplicate.
            echo "DB ERROR: " . mysql_error() . "<br /><br />";
            continue;
        }
        if (mysql_num_rows($sql) == 0) {
            $inserted = mysql_query(
                "INSERT INTO job SET "
                . "url = '{$url_job_sql}', "
                . "url_id = '{$url_id_sql}', "
                . "description = '{$description_sql}', "
                . "source_id = '{$source_id_sql}', "
                . "url_search = '{$url_search_sql}', "
                . "country_id = '{$country_id_sql}', "
                . "url_scraper_date = SYSDATE(), "
                . "url_scraper_hour = SYSDATE()"
            );
            if ($inserted === false) {
                echo "DB ERROR: " . mysql_error() . "<br /><br />";
            }
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }
    // Release the parsed page before the (optional) recursive call below.
    unset($html, $dom, $result, $job_page, $data, $next_page, $text, $url_id, $url_job, $description, $source, $source_id, $url_search);
    //Comment this for tests, uncomment this to get all data
    //	if ($has_next == true){
    //		sleep(1);
    //		scraper($url_next, $country_id);
    //	}
}
Ejemplo n.º 2
0
 // Contact details: the value found for each label is stored unchanged.
 case 'Phone:':
     $phone = $value;
     break;
 case 'Email:':
     $email = $value;
     break;
 case 'Fax:':
     $fax = $value;
     // Keep the raw value and, next to it, whatever format_fax() returns for it.
     $fax_array = format_fax($fax);
     break;
 // Application
 case 'How to apply:':
     // Only a non-empty value is kept and handed to insert_name(); its return value is stored as the id.
     if (strlen($value) > 0) {
         $how_to_apply = $value;
         $how_to_apply_id = insert_name('how_to_apply', $how_to_apply);
     }
     break;
 case 'Contact:':
     $contact = $value;
     break;
 case 'Last date for application:':
     $last_date_for_application = $value;
     break;
 // Other information
 case 'Date published:':
     $date_published = $value;
     break;
 case 'National reference:':
     $national_reference = $value;
Ejemplo n.º 3
0
/**
 * Import a base timetable ("basisrooster") from a udmz file into the database.
 *
 * The file is read with read_udmz_file(). Rows left behind for this file id in files2lessen,
 * grp2ppl and grp2grp are deleted first, then the data is processed in this order:
 * students (Leerling), teachers (Docent), groups (Groep), group-to-group links, lessons (Les).
 * When every step succeeded, files.file_status is set to 1 for this file.
 * Progress is reported through lock_renew_helper() and incdone().
 *
 * The function returns early, without setting file_status, whenever checkset(), add_entity()
 * or add_zermelo_id() reports failure. The one exception is a teacher row that fails
 * checkset(): that row is skipped and the import continues.
 *
 * NOTE(review): $file_id is interpolated unquoted into SQL; presumably callers always pass
 * an integer id — confirm against the caller.
 *
 * @param mixed $file_id  Id of the file being imported; used in SQL and passed on to helpers.
 * @param mixed $tmp_name File name handed to read_udmz_file().
 * @return void
 */
function import_basisrooster($file_id, $tmp_name)
{
    global $grp2ppl, $stamz;
    lock_renew_helper(1);
    $udmz = read_udmz_file($tmp_name);
    // a previous update may have gone wrong, cleanup just in case
    mdb2_exec("DELETE FROM files2lessen WHERE file_id = {$file_id}");
    mdb2_exec("DELETE FROM grp2ppl WHERE file_id_basis = {$file_id}");
    mdb2_exec("DELETE FROM grp2grp WHERE file_id_basis = {$file_id}");
    // first we walk through all students,
    // then all teachers,
    // then all groups,
    // and finally all lessons
    if (!checkset($udmz, 'udmz file', array('Groep', 'Leerling', 'Docent', 'Les'))) {
        return;
    }
    // These settings do not change during the import, so look them up once instead of per row.
    $ignored_categories = config('ZERMELO_CATEGORY_IGNORE');
    $ignored_groups = config('ZERMELO_GROUP_IGNORE');
    $ignore_before_dot = config('IGNORE_BEFORE_DOT');

    // First pass: count the work items so progress can be reported as a fraction.
    $leerlingen = 0;
    $categorieen_leerling = 0;
    foreach ($udmz['Leerling'] as $category => $value) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        $categorieen_leerling++;
        $leerlingen += count($value);
    }
    $docenten = count($udmz['Docent']);
    $groepen = 0;
    $categorieen_groep = 0;
    foreach ($udmz['Groep'] as $category => $value) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        $categorieen_groep++;
        foreach ($value as $id => $row) {
            if (in_array($id, $ignored_groups)) {
                continue;
            }
            $groepen++;
        }
    }
    $lessen = count($udmz['Les']);
    /*
    logit('Categorieen leerling: '.$categorieen_leerling);
    logit('Leerlingen: '.$leerlingen);
    logit('Docenten: '.$docenten);
    logit('Categorieen groep: '.$categorieen_groep);
    logit('Groepen: '.$groepen);
    logit('Lessen: '.$lessen);
    */
    $total = $categorieen_leerling + $leerlingen + $categorieen_groep + $groepen + $docenten + $lessen;
    $done = 0;
    // A file without any importable rows gives $total == 0; divide by 1 instead so the
    // progress fraction is 0 rather than a division by zero.
    $progress_total = $total > 0 ? $total : 1;
    lock_renew_helper(2, $done / $progress_total);

    // Students: one entity per category, per student and per base class (BASICCLASS).
    foreach ($udmz['Leerling'] as $category => $list) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        incdone($done, $total, 2);
        if (!($category_id = add_entity($category, CATEGORIE))) {
            return;
        }
        foreach ($list as $id => $row) {
            incdone($done, $total, 2);
            if (!checkset($row, "Leerling.{$category}", array('LASTNAME', 'FIRSTNAME', 'BETWEENNAME', 'BASICCLASS'))) {
                return;
            }
            if (!($leerling_id = add_entity($id, LEERLING))) {
                return;
            }
            insert_name($leerling_id, $row['FIRSTNAME'], $row['BETWEENNAME'], $row['LASTNAME']);
            if (!($lesgroep_id = add_entity($row['BASICCLASS'], STAMKLAS))) {
                return;
            }
            // Remember which category each base class belongs to; the group pass below checks it.
            $stamz[$row['BASICCLASS']] = $category;
            add_basis_grp2ppl($lesgroep_id, $leerling_id, $file_id);
            add_basis_grp2ppl($category_id, $leerling_id, $file_id);
        }
    }
    lock_renew_helper(2, $done / $progress_total);

    // Teachers: a row missing expected fields is skipped rather than aborting the import.
    foreach ($udmz['Docent'] as $id => $row) {
        incdone($done, $total, 2);
        if (!checkset($row, 'Docent', array('Voornaam', 'Tussenvoegsel', 'Achternaam', 'e-mail'))) {
            continue;
        }
        if (!($docent_id = add_entity($id, DOCENT))) {
            return;
        }
        if ($row['Achternaam'] != '' && $row['Voornaam'] != '') {
            // The first name is stored as an initial followed by a dot.
            insert_name($docent_id, substr($row['Voornaam'], 0, 1) . '.', $row['Tussenvoegsel'], $row['Achternaam'], $row['e-mail']);
        }
    }
    lock_renew_helper(2, $done / $progress_total);

    // Groups: every non-ignored group becomes a LESGROEP entity linked to its students.
    foreach ($udmz['Groep'] as $category => $list) {
        if (in_array($category, $ignored_categories)) {
            continue;
        }
        incdone($done, $total, 2);
        if (!($category_id = add_entity($category, CATEGORIE))) {
            return;
        }
        foreach ($list as $id => $row) {
            if (in_array($id, $ignored_groups)) {
                continue;
            }
            incdone($done, $total, 2);
            if (isset($stamz[$id])) {
                if ($stamz[$id] != $category) {
                    logit('stamklas in andere categorie?!?!?');
                } else {
                    // do nothing, because the base classes were already handled in the student pass
                    continue;
                }
            }
            if (!checkset($row, "Groep.{$category}", array('SET'))) {
                return;
            }
            if (!($lesgroep_id = add_entity($ignore_before_dot ? $id : $category . '.' . $id, LESGROEP))) {
                return;
            }
            if ($row['SET'] == '') {
                // no students in this group
                continue;
            }
            // SET is a comma-separated list of student numbers.
            foreach (explode(',', $row['SET']) as $leerlingnummer) {
                if (!($leerling_id = add_entity($leerlingnummer, LEERLING))) {
                    return;
                }
                add_basis_grp2ppl($lesgroep_id, $leerling_id, $file_id);
            }
        }
    }
    lock_renew_helper(2, $done / $progress_total);

    // Link every pair of groups that share at least one person within this file.
    mdb2_exec(<<<EOT
INSERT INTO grp2grp ( lesgroep_id, lesgroep2_id, file_id_basis )
SELECT DISTINCT grp2ppl.lesgroep_id, grp2ppl2.lesgroep_id, grp2ppl.file_id_basis
FROM grp2ppl
JOIN grp2ppl AS grp2ppl2 ON grp2ppl.ppl_id = grp2ppl2.ppl_id AND grp2ppl.file_id_basis = grp2ppl2.file_id_basis
WHERE grp2ppl.file_id_basis = {$file_id}
EOT
);
    lock_renew_helper(2, $done / $progress_total);

    // Lessons.
    foreach ($udmz['Les'] as $id => $row) {
        incdone($done, $total, 2);
        if (!checkset($row, 'Les', array('#WijzigComment', 'Dag', 'Uur', 'Vak', 'Grp', 'Doc', 'Lok'))) {
            return;
        }
        if (!($zermelo_id = add_zermelo_id($id))) {
            return;
        }
        insert_les(',', $zermelo_id, $row['Dag'], $row['Uur'], $row['Vak'], $row['Grp'], $row['Doc'], $row['Lok'], $file_id, $row['#WijzigComment']);
    }
    // if we get here, everything went well
    mdb2_exec("UPDATE files SET file_status = 1 WHERE file_id = {$file_id}");
    lock_renew_helper(3, $done / $progress_total);
}
Ejemplo n.º 4
0
             // FIXME: hack by Lucas to capture the Contact data. The TH for this field in EURES
             // pages is malformed (e.g. <th colspan="1>Contact:</th>), so the extractor does not
             // work for it and the text is picked up here in the default branch instead.
             default:
                 // Strip the stray closing cell tag, then escape single quotes in the value.
                 $contact = trim(str_replace("</td>", "", $text));
                 $contact = str_replace("'", "\\'", $contact);
                 break;
         }
     }
 }
 ## CLEANING SOME DATA ##
 // Fall back to the currency found in $salary when no explicit currency was set earlier.
 if (isset($salary['currency']) && !isset($salary_currency)) {
     $salary_currency = $salary['currency'];
     $salary_currency_id = insert_name('salary_currency', $salary_currency);
 }
 // Same fallback for the period. The guard tests the key that is actually read
 // ('period', not 'currency'), so a missing period is never read or stored.
 if (isset($salary['period']) && !isset($salary_period)) {
     $salary_period = $salary['period'];
     $salary_period_id = insert_name('salary_period', $salary_period);
 }
 // An address may carry a website among its comma-separated parts; keep the last part that
 // contains one as the homepage. The dot is escaped so only a literal "www." matches.
 if (preg_match('/www\./', $address)) {
     $explode_address = explode(",", $address);
     $i = 0;
     while ($i < count($explode_address)) {
         if (preg_match('/www\./', $explode_address[$i])) {
             $homepage = $explode_address[$i];
         }
         $i++;
     }
 }
 if (preg_match('/www./', $information)) {
     $explode_information = explode(" ", $information);
     $i = 0;
     while ($i < sizeof($explode_information)) {