/**
 * Scrapes one law: follows the headline link on an overview page to the
 * full-text page and records the law's description in the global $rdfData.
 *
 * The resource URI is URI_BASE followed by the link target up to its last
 * dot. For that resource the following is written:
 *   - rdf:type   METALEX_BIBWORK
 *   - dct:title and rdfs:label (German literal taken from the header <h1>)
 *   - nlex:token (first header <p>, only when present)
 *   - rdfs:seeAlso (URL of the full-text page)
 * Every "jnnorm" block after the first is then handed to
 * _handleChapterForURI() or _handleFragmentForURI().
 *
 * @param string $html Markup of the overview page.
 * @param string $url  URL of the overview page; everything up to its last
 *                     '/' is used as base for the relative headline link.
 *
 * @return bool True when every block was handled; false when the headline
 *              link is missing, a block is neither chapter nor fragment, or
 *              a fragment handler reports failure.
 */
function _scrapeLexPage($html, $url)
{
    global $vocab;
    global $rdfData;

    // The overview page is only needed to find the link to the full text.
    $dom = new simple_html_dom();
    $dom->load($html);
    $h2a = $dom->find("h2[@class='headline'] a");
    $href = isset($h2a[0]) ? $h2a[0]->href : null;
    $dom->__destruct();

    // Without a usable link there is no page to scrape and no id to build
    // the URI from, so give up instead of dereferencing a missing node.
    if (!is_string($href) || $href === '') {
        return false;
    }

    $id = substr($href, 0, strrpos($href, '.'));
    $fullURL = substr($url, 0, strrpos($url, '/')) . '/' . $href;
    $uri = URI_BASE . $id;

    $html = _getHTML($fullURL);
    $dom = new simple_html_dom();
    $dom->load($html);

    // x rdf:type metalex:BibliographicWork
    // (http://www.metalex.eu/metalex/2008-05-02#BibliographicWork)
    $rdfData[$uri] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => METALEX_BIBWORK)),
    );

    // title, label -- an absent <h1> yields an empty title rather than an error.
    $h1Element = $dom->find("div[@id='paddingLR12'] div[@class='jnheader'] h1");
    $rawTitle = isset($h1Element[0]) ? $h1Element[0]->plaintext : '';
    $title = html_entity_decode($rawTitle, ENT_COMPAT, 'ISO-8859-1');
    $rdfData[$uri][$vocab['dct'] . 'title'] = array(
        array('type' => 'literal', 'value' => $title, 'lang' => 'de'),
    );
    $rdfData[$uri][$vocab['rdfs'] . 'label'] = array(
        array('type' => 'literal', 'value' => $title, 'lang' => 'de'),
    );

    // token
    $pElems = $dom->find("div[@id='paddingLR12'] div[@class='jnheader'] p");
    if (isset($pElems[0])) {
        $token = html_entity_decode($pElems[0]->plaintext, ENT_COMPAT, 'ISO-8859-1');
        $rdfData[$uri][$vocab['nlex'] . 'token'] = array(
            array('type' => 'literal', 'value' => $token),
        );
    }

    // seeAlso
    $rdfData[$uri][$vocab['rdfs'] . 'seeAlso'] = array(
        array('type' => 'uri', 'value' => $fullURL),
    );

    $success = true;
    $isFirst = true;
    $currentChapterURI = null;
    $jnNorms = $dom->find("div[@id='paddingLR12'] div[@class='jnnorm']");

    foreach ($jnNorms as $data) {
        if ($isFirst) {
            // We skip the first for now, since it contains only header info.
            $isFirst = false;
            continue;
        }

        // A block whose header contains an h3 span is a fragment. It belongs
        // to the most recently seen chapter, or to the law itself when no
        // chapter has been seen yet.
        $h3 = $data->find("div[@class='jnheader'] h3 span");
        if ($h3) {
            $parentURI = (null !== $currentChapterURI) ? $currentChapterURI : $uri;
            if (!_handleFragmentForURI($data, $parentURI, $fullURL)) {
                $success = false;
                break;
            }
            continue;
        }

        // A block whose header contains an h2 is a chapter. Anything else is
        // an unknown structure and aborts the scrape.
        $h2 = $data->find("div[@class='jnheader'] h2");
        if (!$h2) {
            $success = false;
            break;
        }

        // Special case: the chapter handler returns the chapter URI, which is
        // not checked for failure.
        $currentChapterURI = _handleChapterForURI($data, $uri, $fullURL);
    }

    // Single exit point so the DOM is released on failure paths as well.
    $dom->__destruct();

    return $success;
}
/**
 * Fetches a detail page and adds the data found on it to $result.
 *
 * Keys that may be added: 'logoURL', 'zipCity', 'country', 'tel', 'fax',
 * 'mailto' and 'seeAlso' (list of URLs, appended to an existing list).
 * A key is only written when its source data is present on the page.
 *
 * @param string $url    URL of the detail page.
 * @param array  $result Data collected so far.
 *
 * @return array $result, extended with whatever could be extracted.
 */
function _handleDetailPage($url, $result)
{
    $html = _getHTML($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $result = _extractDetailPageData($dom, $result);

    // Released here so that every early exit of the extraction frees the DOM.
    $dom->__destruct();

    return $result;
}

/**
 * Reads logo, address block and external links from a loaded detail page.
 *
 * Returns early (with whatever was collected up to that point) when the
 * featured box is missing or its address part does not have the expected
 * shape; in that case the external links are not collected either.
 *
 * @param simple_html_dom $dom    Loaded detail page.
 * @param array           $result Data collected so far.
 *
 * @return array The extended $result.
 */
function _extractDetailPageData($dom, $result)
{
    $divBoxes = $dom->find("div[@class='featured_box margin_top_fb']");
    if (count($divBoxes) < 1) {
        return $result;
    }
    $box = $divBoxes[0];

    // The logo path is the first single-quoted piece of the style attribute
    // (presumably a background-image url('...') -- TODO confirm).
    $styleParts = explode("'", $box->style);
    if (isset($styleParts[1])) {
        $result['logoURL'] = EUROPA_URL_BASE . $styleParts[1];
    }

    // The address is whatever follows the single </h3> inside the box.
    $addressTextParts = explode('</h3>', $box->xmltext);
    if (count($addressTextParts) !== 2) {
        return $result;
    }

    $addressText = str_replace('<br />', '<br>', $addressTextParts[1]);
    $addressTextParts = explode('<br>', $addressText);
    if (count($addressTextParts) < 3) {
        return $result;
    }

    // A first line without any digit is treated as incomplete and the street
    // is taken to continue on the next line, shifting all following fields.
    // NOTE(review): $street is computed but never stored in $result --
    // confirm whether a 'street' key was intended.
    $street = trim($addressTextParts[0]);
    $curPos = 1;
    if (preg_match('/^.*[0-9]+.*$/', $street) !== 1) {
        $street .= ' ' . trim($addressTextParts[$curPos++]);
    }

    // With a two-line street and only three parts there is no country line,
    // so each field is guarded instead of reading past the end of the array.
    if (isset($addressTextParts[$curPos])) {
        $result['zipCity'] = trim($addressTextParts[$curPos]);
    }
    ++$curPos;
    if (isset($addressTextParts[$curPos])) {
        $result['country'] = trim($addressTextParts[$curPos]);
    }
    ++$curPos;

    // Remaining lines are matched by prefix after lower-casing.
    foreach (array_slice($addressTextParts, $curPos) as $line) {
        $val = strtolower(trim($line));
        if (substr($val, 0, 4) === 'tel:') {
            $result['tel'] = trim(substr($val, 4));
        } elseif (substr($val, 0, 4) === 'fax:') {
            $result['fax'] = trim(substr($val, 4));
        } elseif (substr($val, 0, 2) === '<a') {
            // First double-quoted value of the anchor tag.
            $parts = explode('"', $val);
            if (isset($parts[1])) {
                $result['mailto'] = trim($parts[1]);
            }
        }
    }

    // Links inside the euCenter container whose href contains 'http://'.
    $links = array();
    foreach ($dom->find("div[@id='euCenter'] a") as $a) {
        if (strpos($a->href, 'http://') !== false) {
            $links[] = $a->href;
        }
    }

    if (count($links) > 0) {
        if (isset($result['seeAlso'])) {
            $result['seeAlso'] = array_merge($result['seeAlso'], $links);
        } else {
            $result['seeAlso'] = $links;
        }
    }

    return $result;
}