// NOTE(review): this chunk starts inside two blocks that are opened above the
// visible source. The first statements run once per $indexPageSpec, inside the
// branch that rebuilds the URL cache; the first two closing braces belong to
// those enclosing blocks.
        echo 'Parsing index page: ', $indexPageSpec['title'], PHP_EOL;
        $url = $indexPageSpec['url'];
        $lexURLs = _scrapeIndexPage($url);
        $allLexURLs = array_merge($allLexURLs, $lexURLs);
    }

    echo 'Found ', count($allLexURLs), ' lex pages.', PHP_EOL;

    // Persist the collected URL list so a later run can take the else-branch.
    file_put_contents($cacheFilename, serialize($allLexURLs));
} else {
    // Reuse the list written by an earlier run.
    $allLexURLs = unserialize(file_get_contents($cacheFilename));
}

// $append is false for the first export call only and true for every later one.
$append = false;
foreach ($allLexURLs as $lexURL) {
    echo 'Handling: ', $lexURL['title'], PHP_EOL;

    $html = _getHTML($lexURL['url']);
    _scrapeLexPage($html, $lexURL['url']);
    _exportRDF($rdfData, $append);

    $append = true;
    $rdfData = array(); // reset
}

echo PHP_EOL, "DONE", PHP_EOL;

//
// Functions
//

/**
 * Scrapes a single lex page.
 *
 * Only the beginning of this function is visible in this chunk. The visible
 * part loads the markup into a simple_html_dom instance and selects the
 * anchors inside h2 elements with class "headline".
 *
 * NOTE(review): the function declares the globals $vocab and $rdfData and the
 * caller exports $rdfData right after the call, so results are presumably
 * collected in $rdfData - confirm against the rest of the body.
 *
 * @param string $html Markup of the page (the caller passes the result of _getHTML()).
 * @param string $url  URL the markup was fetched from.
 */
function _scrapeLexPage($html, $url)
{
    global $vocab, $rdfData;

    $dom = new simple_html_dom();
    $dom->load($html);

    $h2a = $dom->find("h2[@class='headline'] a");
// Each entry of $files is read from tmp/ and treated as a concatenation of XML
// documents separated by $xmlDeclString. Every document is parsed and exported.
// $append is false for the first export call only and true for every later one.
$append = false;
foreach ($files as $file) {
    echo "Handling file: {$file}" . PHP_EOL;

    $xmlDocString = file_get_contents("tmp/{$file}");
    if ($xmlDocString === false) {
        // file_get_contents() has already raised a warning; there is nothing to split.
        echo "Could not read file: {$file}" . PHP_EOL;
        continue;
    }

    $i = 0;
    while (true) {
        // Search from offset 1 so that a declaration sitting at the very start
        // of the buffer is not matched. strpos() rejects an offset beyond the
        // end of the string, so only search when the buffer is long enough.
        $pos = strlen($xmlDocString) > 1
            ? strpos($xmlDocString, $xmlDeclString, 1)
            : false;

        if ($pos === false) {
            // No further declaration: what is left is the last document of the
            // file. The loop used to stop here, silently dropping that document.
            $xmlString = $xmlDocString;
            if (trim($xmlString) === '') {
                break;
            }
        } else {
            $xmlString = substr($xmlDocString, 0, $pos);
            // The separator itself is dropped from the remaining buffer.
            $xmlDocString = (string) substr($xmlDocString, $pos + strlen($xmlDeclString));
        }

        $xml = simplexml_load_string($xmlString);
        if ($xml === false) {
            // Do not hand a failed parse to _parseEPBulletin().
            echo PHP_EOL . "Skipping unparsable XML segment in file: {$file}" . PHP_EOL;
        } else {
            $result = _parseEPBulletin($xml);
            echo "\rWriting triples for document number: " . $i++;
            _exportRDF($result, $append);
            $append = true;
        }

        if ($pos === false) {
            // The segment just handled was the last one of this file.
            break;
        }
    }
    echo PHP_EOL;
}

echo "Writing schema triples now..." . PHP_EOL;
_writeSchemaTriples();

##### Functions

/**
 * Processes one parsed bulletin document.
 *
 * Only the beginning of this function is visible in this chunk. The visible
 * part casts the element to an array and reads its "id", "file" and "lang"
 * attributes as well as its SDOBI child.
 *
 * NOTE(review): the caller passes the return value straight to _exportRDF();
 * the return statement itself is outside the visible source.
 *
 * @param SimpleXMLElement $xml Document as returned by simplexml_load_string().
 */
function _parseEPBulletin($xml) {
    $xml = (array) $xml;
    $attributes = $xml['@attributes'];
    $sdobi = (array) $xml['SDOBI'];

    $id = $attributes['id'];
    $file = $attributes['file'];
    $lang = $attributes['lang'];
// One resource per country, typed unodc:Country, labelled with the country
// name and linked to its sub-region.
foreach ($countries as $country => $subRegion) {
    $countryURI = URI_BASE . 'country/' . urlencode($country);
    $subRegionURI = URI_BASE . 'subregion/' . urlencode($subRegion);

    $rdfData[$countryURI] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => $vocab['unodc'] . 'Country')),
        $vocab['rdfs'] . 'label' => array(array('type' => 'literal', 'value' => $country)),
        $vocab['unodc'] . 'locatedInSubRegion' => array(array('type' => 'uri', 'value' => $subRegionURI)),
    );
}

// One resource per year in the inclusive range $startYear..$endYear.
for ($i = $startYear; $i <= $endYear; ++$i) {
    $yearURI = URI_BASE . 'year/' . $i;

    $rdfData[$yearURI] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => $vocab['unodc'] . 'Year')),
        $vocab['rdfs'] . 'label' => array(array('type' => 'literal', 'value' => $i)),
    );
}
_exportRDF($rdfData, true);

// schema
// Every entry of $propURIs becomes a qb:MeasureProperty with its label.
// NOTE(review): the third argument presumably selects a separate output -
// confirm against _exportRDF().
$rdfData = array();
foreach ($propURIs as $uri => $label) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => $vocab['qb'] . 'MeasureProperty')),
        $vocab['rdfs'] . 'label' => array(array('type' => 'literal', 'value' => $label)),
    );
}
_exportRDF($rdfData, false, 'schema');

// Functions

/**
 * Handles one spreadsheet dataset.
 *
 * Only the beginning of this function is visible in this chunk. The visible
 * part opens the file with Spreadsheet_Excel_Reader and derives the dataset
 * name and URI from the file name.
 *
 * @param string $fileName Path of the .xls file; 'data/' and '.xls' are
 *                         stripped from it to obtain the dataset name.
 */
function _handleDataset($fileName) {
    global $startYear, $endYear, $vocab, $regions, $subRegions, $countries, $propURIs;

    $data = new Spreadsheet_Excel_Reader($fileName);

    // Strip both the directory prefix and the extension. The second replacement
    // used to start again from $fileName, which threw away the 'data/' removal
    // and left the prefix inside the dataset URI.
    $datasetName = str_replace(array('data/', '.xls'), '', $fileName);
    $datasetComment = null;
    $datasetURI = URI_BASE . 'dataset/' . urlencode($datasetName);
// Export whatever was collected in $rdfData above this chunk.
_exportRDF($rdfData, false);

// Countries: an English title plus one extra title for every language in
// $otherLanguages for which the country has a "title_<code>" entry.
// NOTE(review): the third argument of _exportRDF() presumably selects a
// separate output - confirm against its definition.
$rdfData = array();
foreach ($countries as $uri => $c) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => $vocab['agencies'] . 'Country')),
        $vocab['dct'] . 'title' => array(array('type' => 'literal', 'value' => $c['title'], 'lang' => 'en')),
    );

    foreach ($otherLanguages as $langCode) {
        if (isset($c["title_{$langCode}"])) {
            $rdfData[$uri][$vocab['dct'] . 'title'][] = array(
                'type' => 'literal',
                'value' => $c["title_{$langCode}"],
                'lang' => $langCode,
            );
        }
    }
}
_exportRDF($rdfData, false, 'countries');

// Cities: an English title and a link to the country the city is located in.
$rdfData = array();
foreach ($cities as $uri => $c) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(array('type' => 'uri', 'value' => $vocab['agencies'] . 'City')),
        $vocab['dct'] . 'title' => array(array('type' => 'literal', 'value' => $c['title'], 'lang' => 'en')),
        $vocab['agencies'] . 'locatedInCountry' => array(array('type' => 'uri', 'value' => $c['country'])),
    );
}
_exportRDF($rdfData, false, 'cities');

echo 'DONE!' . PHP_EOL;

// The script ends here. The debug dump of $agenciesByTitle and the second exit
// that used to follow this statement could never run and were removed.
exit;

/**
 * Walks the agency lists on the recruitment page.
 *
 * Only the beginning of this function is visible in this chunk. The visible
 * part fetches the page, selects every ul inside div.agencies_list and passes
 * each li of those lists to _handleAgencyListItem().
 */
function _handleRecruitmentPage() {
    $url = 'http://europa.eu/agencies/recruitment/index_en.htm';
    $html = _getHTML($url);

    $dom = new simple_html_dom();
    $dom->load($html);

    $ulElements = $dom->find("div[@class='agencies_list'] ul");
    foreach ($ulElements as $ul) {
        $liElements = $ul->find("li");
        foreach ($liElements as $li) {
            _handleAgencyListItem($li);