Beispiel #1
0
/**
 * Scrapes one law page into the global $rdfData array.
 *
 * The given overview HTML is searched for the first link inside
 * `h2.headline`; its href determines both the resource ID (href without its
 * extension) and the URL of the full-text page, which is then fetched with
 * _getHTML() and parsed. For the resulting resource URI the function records
 * rdf:type (METALEX_BIBWORK), dct:title, rdfs:label, nlex:token (only when a
 * header paragraph exists) and rdfs:seeAlso. Afterwards every `div.jnnorm`
 * block except the first is handed to _handleFragmentForURI() (header contains
 * an `h3 span`) or _handleChapterForURI() (header contains an `h2`).
 *
 * Data written to $rdfData before a failure is not rolled back.
 *
 * @param string $html HTML of the overview page containing the headline link.
 * @param string $url  URL the overview page was loaded from; everything up to
 *                     its last '/' is used as base for the headline href.
 *
 * @return bool True when all blocks were processed; false when the headline
 *              link is missing, a block is neither fragment nor chapter, or
 *              the fragment handler reports failure.
 */
function _scrapeLexPage($html, $url)
{
    global $vocab;
    global $rdfData;

    // Step 1: find the link to the full-text page on the overview page.
    $dom = new simple_html_dom();
    $dom->load($html);
    $h2a = $dom->find("h2[@class='headline'] a");
    $href = isset($h2a[0]) ? $h2a[0]->href : null;
    // The href has been copied out, so the overview DOM can be released on every path.
    $dom->__destruct();
    if (!is_string($href) || $href === '') {
        // Without a headline link there is neither an ID nor a page to fetch.
        return false;
    }

    // The ID is the href without its extension. strrpos() yields false when
    // there is no '.', in which case the whole href is used instead of
    // silently collapsing the ID to an empty string.
    $dotPos = strrpos($href, '.');
    $id = ($dotPos === false) ? $href : substr($href, 0, $dotPos);
    $fullURL = substr($url, 0, strrpos($url, '/')) . '/' . $href;
    $uri = URI_BASE . $id;

    // Step 2: load and parse the full-text page.
    $html = _getHTML($fullURL);
    $dom = new simple_html_dom();
    $dom->load($html);

    // x rdf:type metalex:BibliographicWork
    // (see http://www.metalex.eu/metalex/2008-05-02#BibliographicWork)
    $rdfData[$uri] = array(RDF_TYPE => array(array('type' => 'uri', 'value' => METALEX_BIBWORK)));

    // title, label (an absent h1 yields an empty title instead of a null dereference)
    $h1Element = $dom->find("div[@id='paddingLR12'] div[@class='jnheader'] h1");
    $title = isset($h1Element[0])
        ? html_entity_decode($h1Element[0]->plaintext, ENT_COMPAT, 'ISO-8859-1')
        : '';
    $rdfData[$uri][$vocab['dct'] . 'title'] = array(array('type' => 'literal', 'value' => $title, 'lang' => 'de'));
    $rdfData[$uri][$vocab['rdfs'] . 'label'] = array(array('type' => 'literal', 'value' => $title, 'lang' => 'de'));

    // token (optional: only present when the header has a paragraph)
    $pElems = $dom->find("div[@id='paddingLR12'] div[@class='jnheader'] p");
    if (isset($pElems[0])) {
        $token = html_entity_decode($pElems[0]->plaintext, ENT_COMPAT, 'ISO-8859-1');
        $rdfData[$uri][$vocab['nlex'] . 'token'] = array(array('type' => 'literal', 'value' => $token));
    }

    // seeAlso
    $rdfData[$uri][$vocab['rdfs'] . 'seeAlso'] = array(array('type' => 'uri', 'value' => $fullURL));

    // Step 3: walk the norm blocks. Failures break out of the loop instead of
    // returning directly so that the DOM is always released below.
    $success = true;
    $isHeaderBlock = true;
    $currentChapterURI = null;
    $jnNorms = $dom->find("div[@id='paddingLR12'] div[@class='jnnorm']");
    foreach ($jnNorms as $data) {
        if ($isHeaderBlock) {
            // We skip the first for now, since it contains only header info
            $isHeaderBlock = false;
            continue;
        }

        if ($data->find("div[@class='jnheader'] h3 span")) {
            // An h3 marks a fragment; it belongs to the most recent chapter,
            // or directly to the work when no chapter has been seen yet.
            $parentURI = (null !== $currentChapterURI) ? $currentChapterURI : $uri;
            if (!_handleFragmentForURI($data, $parentURI, $fullURL)) {
                $success = false;
                break;
            }
        } elseif ($data->find("div[@class='jnheader'] h2")) {
            // An h2 marks a chapter; subsequent fragments attach to it.
            $currentChapterURI = _handleChapterForURI($data, $uri, $fullURL);
        } else {
            // Neither fragment nor chapter: unknown structure, give up.
            $success = false;
            break;
        }
    }

    $dom->__destruct();
    return $success;
}
Beispiel #2
0
/**
 * Fetches a detail page and enriches $result with the data found on it.
 *
 * The first `div.featured_box.margin_top_fb` supplies the logo URL (quoted
 * part of its style attribute, prefixed with EUROPA_URL_BASE) and an address
 * block: everything after the closing `</h3>` is split on `<br>` into street,
 * zip/city, country and optional `tel:` / `fax:` / link lines. Finally all
 * links inside `div#euCenter` whose href contains 'http://' are appended to
 * `$result['seeAlso']`.
 *
 * Keys that may be set: logoURL, zipCity, country, tel, fax, mailto, seeAlso.
 * If the box is missing or the address block does not have the expected
 * shape, $result is returned with whatever was collected up to that point
 * (in particular without the seeAlso links).
 *
 * @param string $url    URL of the detail page, passed to _getHTML().
 * @param array  $result Data collected so far for this entry.
 *
 * @return array The (possibly extended) $result array.
 */
function _handleDetailPage($url, $result)
{
    $html = _getHTML($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $divBoxes = $dom->find("div[@class='featured_box margin_top_fb']");
    if (count($divBoxes) < 1) {
        // Release the DOM on early exits as well, not only at the end.
        $dom->__destruct();
        return $result;
    }
    $box = $divBoxes[0];

    // The image path is the single-quoted part of the style attribute.
    // Only set logoURL when such a part exists, so that a box without a
    // quoted URL does not produce a bogus "base URL only" logo.
    $styleParts = explode("'", (string) $box->style);
    if (isset($styleParts[1])) {
        $result['logoURL'] = EUROPA_URL_BASE . $styleParts[1];
    }

    // The address follows the (single) heading of the box.
    $addressTextParts = explode('</h3>', $box->xmltext);
    if (count($addressTextParts) !== 2) {
        $dom->__destruct();
        return $result;
    }

    // Normalise both line-break spellings, then split into address lines.
    $addressText = str_replace('<br />', '<br>', $addressTextParts[1]);
    $addressTextParts = explode('<br>', $addressText);
    $partCount = count($addressTextParts);
    if ($partCount < 3) {
        $dom->__destruct();
        return $result;
    }

    // A street line without any digit is taken to continue on the next line.
    // NOTE(review): $street is assembled but never stored in $result; it only
    // influences which lines are read as zip/city and country. Confirm whether
    // a 'street' key was intended.
    $street = trim($addressTextParts[0]);
    $curPos = 1;
    if (preg_match('/^.*[0-9]+.*$/', $street) !== 1) {
        $street .= ' ' . trim($addressTextParts[$curPos++]);
    }

    // $curPos is 1 or 2 here, so zip/city is always in range ($partCount >= 3).
    $result['zipCity'] = trim($addressTextParts[$curPos++]);
    // The country line may be missing when the street used up two lines.
    $result['country'] = isset($addressTextParts[$curPos]) ? trim($addressTextParts[$curPos]) : '';
    ++$curPos;

    // Remaining lines: telephone, fax and a link.
    // NOTE(review): values are taken from the lower-cased line, so the stored
    // tel/fax/mailto strings are lower-case; kept for compatibility.
    for ($i = $curPos; $i < $partCount; ++$i) {
        $val = strtolower(trim($addressTextParts[$i]));
        if (substr($val, 0, 4) === 'tel:') {
            $result['tel'] = trim(substr($val, 4));
        } elseif (substr($val, 0, 4) === 'fax:') {
            $result['fax'] = trim(substr($val, 4));
        } elseif (substr($val, 0, 2) === '<a') {
            // The first double-quoted attribute value of the anchor is used.
            $parts = explode('"', $val);
            if (isset($parts[1])) {
                $result['mailto'] = trim($parts[1]);
            }
        }
    }

    // Collect external links from the centre column.
    $links = array();
    $aElements = $dom->find("div[@id='euCenter'] a");
    foreach ($aElements as $a) {
        if (strpos($a->href, 'http://') !== false) {
            $links[] = $a->href;
        }
    }
    if (count($links) > 0) {
        $result['seeAlso'] = isset($result['seeAlso'])
            ? array_merge($result['seeAlso'], $links)
            : $links;
    }

    $dom->__destruct();
    return $result;
}