/**
 * Scrapes every page linked from the nested list on this page.
 *
 * For each anchor matched by the XPath below, the href is appended to the
 * whoiswho base URL and handed to a new InstitutionScraper whose slug prefix
 * is set to 'agencies'. flushNtriples() is called on this scraper after each
 * linked page has been scraped.
 */
function scrape()
{
    foreach ($this->xpath->query('//td/table//tr/td/table//tr/td/ul/li/a') as $el) {
        // Only the link target is used; the link text was read but never
        // consumed, so it is no longer fetched.
        $linkHref = $el->getAttribute('href');

        $scraper = new InstitutionScraper('http://europa.eu/whoiswho/public/' . $linkHref);
        $scraper->setSlugPrefix('agencies');
        $scraper->scrape();

        $this->flushNtriples();
    }
}
/**
 * Scrapes the institutions linked from this page.
 *
 * For every link matched by WHOISWHO_INSTITUTION_PAGE_LINKS whose node id is
 * not already listed in the global $scrapedNodes:
 *  - links with an empty label are ignored (the node id is still recorded);
 *  - links whose URL contains 'nodeID=18&' are handed to an
 *    AgenciesAndOtherBodiesScraper;
 *  - every other link becomes a resource under INST . 'institutions/', gets a
 *    FOAF_PAGE triple, an optional OWL_SAMEAS triple, and is scraped with an
 *    InstitutionScraper whose graph is merged into this scraper's graph.
 *
 * Uses the globals $scrapedNodes (appended to) and $publicInstitutionsGraph
 * (queried for subjects whose DCT title equals the link label).
 */
function scrape()
{
    $this->translateLabelsOnPage(WHOISWHO_INSTITUTION_PAGE_LINKS);

    global $scrapedNodes;
    global $publicInstitutionsGraph;

    foreach ($this->xpath->query(WHOISWHO_INSTITUTION_PAGE_LINKS) as $link) {
        // Only the text 'lang=en' is removed, so an '&' that preceded it stays
        // at the end of the URL. The 'nodeID=18&' test below and the
        // '&lang=en' suffix appended further down both operate on this form.
        $webPageUri = str_replace(
            'lang=en',
            '',
            'http://europa.eu/whoiswho/public/' . $link->getAttribute('href')
        );
        $nodeId = $this->getNodeIdFromUrl($webPageUri);

        if (in_array($nodeId, $scrapedNodes)) {
            // Already handled: skip without flushing, as before.
            continue;
        }
        $scrapedNodes[] = $nodeId;

        $label = $link->textContent;

        if ($label != '') {
            // Explicit comparison with false instead of relying on the
            // truthiness of the offset returned by strpos().
            $isAgenciesPage = strpos($webPageUri, 'nodeID=18&') !== false;

            if ($isAgenciesPage) {
                // "Agencies and other bodies" page: delegated to its own scraper.
                $AgenciesScraper = new AgenciesAndOtherBodiesScraper($webPageUri);
                $AgenciesScraper->translate_langs = array('fr', 'de', 'it', 'nl', 'es', 'no', 'da', 'bg', 'pl', 'fi');
                $AgenciesScraper->scrape();
            } else {
                $uri = INST . 'institutions/' . $nodeId;
                $type = $this->choose_type($label);
                $this->add_resource($uri, $type, $label, 'en-gb');
                $this->graph->add_resource_triple($uri, FOAF_PAGE, $webPageUri);

                // array_shift() takes its argument by reference, so the lookup
                // result must be stored in a variable first; passing the call
                // result directly raises "Only variables should be passed by
                // reference".
                $sameAsCandidates = $publicInstitutionsGraph->get_subjects_where_literal(DCT . 'title', $label);
                $sameAs = array_shift($sameAsCandidates);
                if ($sameAs) {
                    $this->graph->add_resource_triple($uri, OWL_SAMEAS, $sameAs);
                }

                $InstitutionScraper = new InstitutionScraper($webPageUri . '&lang=en');
                $InstitutionScraper->setTopLevelInstitution($uri);
                $InstitutionScraper->scrape($uri, ORG . 'hasSubOrganization', ORG . 'transitiveSubOrganisationOf');
                $this->graph->add_graph($InstitutionScraper->get_graph());
            }
        }

        // Runs for every link that was not skipped above, including links
        // with an empty label.
        $this->flushNtriples();
    }
}
/**
 * Scrapes this page: address details, sub-organisation links and person links.
 *
 * Steps, in order:
 *  1. If $linkSubject is truthy, the text returned by parseAddressDetails()
 *     for each matched node is concatenated and stored as an
 *     OV . 'postalAddress' literal on $linkSubject.
 *  2. Each link matched by WHOISWHO_SUB_ORGANISATION_LINKS_XPATH whose node id
 *     is not yet in the global $scrapedNodes becomes a resource, is linked to
 *     the top-level institution (and to $linkSubject when predicates are
 *     given), and its page is scraped by a new InstitutionScraper.
 *  3. Each link containing 'personID=' whose person id is not yet in the
 *     global $scrapedPeople is scraped by a new EUPersonScraper.
 *
 * @param string|false $linkSubject          URI of the resource this page describes; false to skip
 *                                           the address step and the linking triples.
 * @param string|false $linkPredicate        Predicate for ($linkSubject, predicate, sub-organisation).
 * @param string|false $inverseLinkPredicate Predicate for (sub-organisation, predicate, $linkSubject).
 */
function scrape($linkSubject = false, $linkPredicate = false, $inverseLinkPredicate = false)
{
    global $scrapedNodes;
    global $publicInstitutionsGraph;

    $this->translateLabelsOnPage(WHOISWHO_SUB_ORGANISATION_LINKS_XPATH);

    if ($linkSubject) {
        $uri = $linkSubject;
        $address = '';
        foreach ($this->xpath->query("//node()[preceding-sibling::h3][following::br]") as $detailsTextLine) {
            $address .= $this->parseAddressDetails($detailsTextLine, $uri);
        }

        // Trim before testing so that a whitespace-only result does not
        // produce an empty postalAddress literal.
        $address = trim($address);
        if ($address !== '') {
            $this->graph->add_literal_triple($uri, OV . 'postalAddress', $address);
        }
    }

    $subOrganisations = $this->xpath->query(WHOISWHO_SUB_ORGANISATION_LINKS_XPATH);
    foreach ($subOrganisations as $link) {
        $nodeId = $this->getNodeIdFromUrl($link->getAttribute('href'));
        if (in_array($nodeId, $scrapedNodes)) {
            $this->log_message("Already scraped node {$nodeId}");
            continue;
        }
        $scrapedNodes[] = $nodeId;

        $webPageUri = str_replace(
            '&lang=en',
            '',
            'http://europa.eu/whoiswho/public/' . $link->getAttribute('href')
        );
        $label = $link->textContent;
        $uri = INST . $this->slugPrefix . '/' . $nodeId;
        $type = $this->choose_type($label);

        $this->add_resource($uri, $type, $label, 'en-gb');
        $this->graph->add_resource_triple($uri, FOAF_PAGE, $webPageUri);
        $this->graph->add_resource_triple($uri, ORG . 'transitiveSubOrganisationOf', $this->topLevelInstitution);

        // NOTE(review): pages whose own node id is 4180 are excluded from
        // sameAs matching; the reason is not visible in this block.
        if ($this->getNodeIdFromUrl($this->pageUri) != 4180) {
            // array_shift() takes its argument by reference, so the lookup
            // result must be stored in a variable first; passing the call
            // result directly raises "Only variables should be passed by
            // reference".
            $sameAsCandidates = $publicInstitutionsGraph->get_subjects_where_literal(DCT . 'title', $label);
            $sameAs = array_shift($sameAsCandidates);

            if ($sameAs) {
                $this->graph->add_resource_triple($uri, OWL_SAMEAS, $sameAs);

                // Replace the chosen type(s) with the types recorded for the
                // matching subject in the public institutions graph.
                $this->graph->remove_property_values($uri, RDF_TYPE);
                foreach ($publicInstitutionsGraph->get_resource_triple_values($sameAs, RDF_TYPE) as $sameAsType) {
                    $this->graph->add_resource_triple($uri, RDF_TYPE, $sameAsType);
                }
            }
        }

        if ($linkSubject && $linkPredicate) {
            $this->graph->add_resource_triple($linkSubject, $linkPredicate, $uri);
        }
        if ($linkSubject && $inverseLinkPredicate) {
            $this->graph->add_resource_triple($uri, $inverseLinkPredicate, $linkSubject);
        }

        // Scrape the sub-organisation's own page, keeping the same top-level
        // institution. Its graph is not merged into this one here.
        $InstitutionScraper = new InstitutionScraper($webPageUri);
        $InstitutionScraper->setTopLevelInstitution($this->topLevelInstitution);
        $InstitutionScraper->scrape($uri, ORG . 'hasSubOrganization', ORG . 'subOrganisationOf');

        $this->flushNtriples();
    }

    global $scrapedPeople;

    $thisNodeId = $this->getNodeIdFromUrl($this->pageUri);
    foreach ($this->xpath->query("//a[contains(@href, 'personID=')]") as $a) {
        $personId = $this->getPersonIdFromUrl($a->getAttribute('href'));
        if (in_array($personId, $scrapedPeople)) {
            $this->log_message("Already scraped person {$personId}");
            continue;
        }
        $scrapedPeople[] = $personId;

        // Strip the language parameter and this page's nodeID parameter from
        // the person URL before scraping it.
        $webPageUri = str_replace(
            '&lang=en',
            '',
            'http://europa.eu/whoiswho/public/' . $a->getAttribute('href')
        );
        $webPageUri = str_replace('&nodeID=' . $thisNodeId, '', $webPageUri);

        $personScraper = new EUPersonScraper($webPageUri);
        $personScraper->scrape();
    }

    $this->flushNtriples();
}
<?php
// Script: scrapes the whoiswho hierarchy page for node 54023 with an
// InstitutionScraper.

// PATH_SEPARATOR is used instead of a hard-coded ':' so the include path is
// also valid on platforms whose separator is ';'.
set_include_path(get_include_path() . PATH_SEPARATOR . '../');
require 'scrapewhoiswho.php';

$url = 'http://europa.eu/whoiswho/public/index.cfm?fuseaction=idea.hierarchy&nodeID=54023&lang=en';

// NOTE(review): $publicInstitutionsGraph is not assigned in this script;
// presumably scrapewhoiswho.php defines it — confirm.
$scraper = new InstitutionScraper($url, $publicInstitutionsGraph);
$scraper->scrape();