Ejemplo n.º 1
0
        echo 'Parsing index page: ' . $indexPageSpec['title'] . PHP_EOL;
        $url = $indexPageSpec['url'];
        $lexURLs = _scrapeIndexPage($url);
        $allLexURLs = array_merge($allLexURLs, $lexURLs);
    }
    echo 'Found ' . count($allLexURLs) . ' lex pages.' . PHP_EOL;
    file_put_contents($cacheFilename, serialize($allLexURLs));
} else {
    $allLexURLs = unserialize(file_get_contents($cacheFilename));
}
// Scrape each collected lex page and export its data. The first call to
// _exportRDF() receives $append = false, every later call receives true.
$append = false;
foreach ($allLexURLs as $lexURL) {
    echo "Handling: {$lexURL['title']}", PHP_EOL;
    $html = _getHTML($lexURL['url']);
    // _scrapeLexPage() declares `global $rdfData`; presumably it fills that
    // array, which is then handed to the exporter below.
    _scrapeLexPage($html, $lexURL['url']);
    _exportRDF($rdfData, $append);
    // Reset the buffer so the next page starts from an empty array.
    $rdfData = array();
    $append = true;
}
echo PHP_EOL, 'DONE', PHP_EOL;
//
// Functions
//
function _scrapeLexPage($html, $url)
{
    global $vocab;
    global $rdfData;
    $dom = new simple_html_dom();
    $dom->load($html);
    $h2a = $dom->find("h2[@class='headline'] a");
Ejemplo n.º 2
0
// Convert every file listed in $files (read from tmp/) to RDF. Each file is
// treated as a sequence of XML documents separated by $xmlDeclString
// (presumably the XML declaration; its value is defined outside this section).
// The first export call receives $append = false, all later ones true.
$append = false;
foreach ($files as $file) {
    echo "Handling file: {$file}" . PHP_EOL;
    $xmlDocString = file_get_contents("tmp/{$file}");
    if ($xmlDocString === false) {
        // Unreadable file: report it and move on instead of splitting `false`.
        echo "Could not read file: {$file}" . PHP_EOL;
        continue;
    }
    $i = 0;
    // Keep going until only whitespace (or nothing) is left in the buffer.
    while (trim($xmlDocString) !== '') {
        // Search from offset 1 so a separator sitting at the very start of
        // the buffer is not matched as the end of the current document.
        $pos = strpos($xmlDocString, $xmlDeclString, 1);
        if ($pos === false) {
            // No further separator: the remainder is the last document of the
            // file. Previously the loop stopped here and dropped it.
            $xmlString = $xmlDocString;
            $xmlDocString = '';
        } else {
            $xmlString = substr($xmlDocString, 0, $pos);
            // The separator itself is removed from the remaining buffer.
            $xmlDocString = (string) substr($xmlDocString, $pos + strlen($xmlDeclString));
        }
        $xml = simplexml_load_string($xmlString);
        if ($xml === false) {
            // Do not hand a failed parse result to _parseEPBulletin().
            echo PHP_EOL . "Skipping a document that could not be parsed as XML." . PHP_EOL;
            continue;
        }
        $result = _parseEPBulletin($xml);
        // "\r" rewrites the same console line to show progress.
        echo "\rWriting triples for document number: " . $i++;
        _exportRDF($result, $append);
        $append = true;
    }
    echo PHP_EOL;
}
echo "Writing schema triples now..." . PHP_EOL;
_writeSchemaTriples();
##### Functions
function _parseEPBulletin($xml)
{
    $xml = (array) $xml;
    $attributes = $xml['@attributes'];
    $sdobi = (array) $xml['SDOBI'];
    $id = $attributes['id'];
    $file = $attributes['file'];
    $lang = $attributes['lang'];
Ejemplo n.º 3
0
// One resource per country: its type, its label and a link to its sub-region.
foreach ($countries as $country => $subRegion) {
    $countryURI = URI_BASE . 'country/' . urlencode($country);
    $subRegionURI = URI_BASE . 'subregion/' . urlencode($subRegion);
    $rdfData[$countryURI] = array(
        RDF_TYPE => array(
            array('type' => 'uri', 'value' => $vocab['unodc'] . 'Country'),
        ),
        $vocab['rdfs'] . 'label' => array(
            array('type' => 'literal', 'value' => $country),
        ),
        $vocab['unodc'] . 'locatedInSubRegion' => array(
            array('type' => 'uri', 'value' => $subRegionURI),
        ),
    );
}

// One resource per year in the inclusive range $startYear..$endYear.
for ($i = $startYear; $i <= $endYear; $i++) {
    $yearURI = URI_BASE . 'year/' . $i;
    $rdfData[$yearURI] = array(
        RDF_TYPE => array(
            array('type' => 'uri', 'value' => $vocab['unodc'] . 'Year'),
        ),
        $vocab['rdfs'] . 'label' => array(
            array('type' => 'literal', 'value' => $i),
        ),
    );
}
_exportRDF($rdfData, true);

// Schema: every URI in $propURIs is described as a qb MeasureProperty with
// its label, collected in a fresh array and exported under the name 'schema'.
$rdfData = array();
foreach ($propURIs as $uri => $label) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(
            array('type' => 'uri', 'value' => $vocab['qb'] . 'MeasureProperty'),
        ),
        $vocab['rdfs'] . 'label' => array(
            array('type' => 'literal', 'value' => $label),
        ),
    );
}
_exportRDF($rdfData, false, 'schema');
// Functions
function _handleDataset($fileName)
{
    global $startYear;
    global $endYear;
    global $vocab;
    global $regions;
    global $subRegions;
    global $countries;
    global $propURIs;
    $data = new Spreadsheet_Excel_Reader($fileName);
    $datasetName = str_replace('data/', '', $fileName);
    $datasetName = str_replace('.xls', '', $fileName);
    $datasetComment = null;
    $datasetURI = URI_BASE . 'dataset/' . urlencode($datasetName);
Ejemplo n.º 4
0
// Export whatever has been collected in $rdfData so far.
_exportRDF($rdfData, false);

// Countries: an English title plus one title per additional language for
// which the country record carries a "title_<langCode>" entry.
$rdfData = array();
foreach ($countries as $uri => $c) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(
            array('type' => 'uri', 'value' => $vocab['agencies'] . 'Country'),
        ),
        $vocab['dct'] . 'title' => array(
            array('type' => 'literal', 'value' => $c['title'], 'lang' => 'en'),
        ),
    );
    foreach ($otherLanguages as $langCode) {
        if (isset($c["title_{$langCode}"])) {
            $rdfData[$uri][$vocab['dct'] . 'title'][] = array(
                'type' => 'literal',
                'value' => $c["title_{$langCode}"],
                'lang' => $langCode,
            );
        }
    }
}
_exportRDF($rdfData, false, 'countries');

// Cities: an English title and a link to the country URI stored in the record.
$rdfData = array();
foreach ($cities as $uri => $c) {
    $rdfData[$uri] = array(
        RDF_TYPE => array(
            array('type' => 'uri', 'value' => $vocab['agencies'] . 'City'),
        ),
        $vocab['dct'] . 'title' => array(
            array('type' => 'literal', 'value' => $c['title'], 'lang' => 'en'),
        ),
        $vocab['agencies'] . 'locatedInCountry' => array(
            array('type' => 'uri', 'value' => $c['country']),
        ),
    );
}
_exportRDF($rdfData, false, 'cities');
echo 'DONE!' . PHP_EOL;
// Stop here; the unreachable debug dump that used to follow this exit has
// been removed.
exit;
function _handleRecruitmentPage()
{
    $url = 'http://europa.eu/agencies/recruitment/index_en.htm';
    $html = _getHTML($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $ulElements = $dom->find("div[@class='agencies_list'] ul");
    foreach ($ulElements as $ul) {
        $liElements = $ul->find("li");
        foreach ($liElements as $li) {
            _handleAgencyListItem($li);