/**
 * Collects the geographic coordinates found in a page and turns the first
 * valid one into RDF triples.
 *
 * Candidates are gathered in this order:
 *   1. extractGeoInfo() on the whole page source (logged as the "title entry"),
 *   2. per infobox, every entry of GeoExtractor::$knownInfoboxFormats whose
 *      latitude and longitude properties are present in the box,
 *   3. per infobox, every property named in GeoExtractor::$knownTemplatePredicates
 *      from which extractGeoInfo() returns coordinates.
 *
 * Only the first candidate for which is_valid() holds produces triples. When
 * $this->batchExtraction is set, the shared database is queried first and
 * nothing is generated if a point is already stored for the resource;
 * otherwise the new point is inserted before the triples are added.
 *
 * If GeoExtractor::enablePreview is set, an <iframe> map preview is echoed for
 * every valid candidate, not just the first one.
 *
 * @param mixed  $pageID     Page identifier; passed to ExtractionResult and used
 *                           as the "resource" SQL parameter.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page source that is searched for coordinates.
 *
 * @return ExtractionResult Result object holding the generated triples (possibly none).
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $foundCoordinates = array();

    /* Main title */
    $geoInfo = $this->extractGeoInfo($pageSource, true);
    if ($geoInfo) {
        $foundCoordinates[] = new geo_param($geoInfo);
        $this->log('debug', "Found title entry '" . implode("|", $geoInfo) . "'");
    }

    /* Coordinates provided in infobox formats */
    $infoboxes = $this->getInfoboxes($pageSource);
    foreach ($infoboxes[1] as $box) {
        $boxProperties = $this->getBoxProperties($box, true);

        foreach (GeoExtractor::$knownInfoboxFormats as $format) {
            /*
             * Global defaults. Index 0 is checked as latitude and index 4 as
             * longitude below; index 3 holds the N/S and index 7 the E/W marker.
             */
            $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

            /* Apply template-specific NS/EW defaults */
            if (isset($format[8])) {
                $pieces[3] = $format[8];
            }
            if (isset($format[9])) {
                $pieces[7] = $format[9];
            }

            /*
             * Copy from template: each format entry names the infobox property
             * that supplies the piece with the same index.
             * NOTE(review): the loop also visits indices 8 and 9 (the NS/EW
             * defaults), so a box property with that name would be copied into
             * $pieces as well - confirm this is intended.
             */
            $numFields = count($format);
            for ($i = 0; $i < $numFields; $i++) {
                $formatString = urldecode($format[$i]);
                $propertyName = strtolower($formatString);
                if ($formatString != "" && isset($boxProperties[$propertyName])) {
                    $value = $boxProperties[$propertyName];
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    $pieces[$i] = ($i == 7 && $value == 'O') ? 'E' : $value;
                }
            }

            if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                $foundCoordinates[] = new geo_param($pieces);
                $this->log('debug', "Found format '" . $format[0] . "'");
            }
        }

        /*
         * Look for coordinate tags inside the infobox.
         * Used widely, e.g. airports (=> Los_Angeles_International_Airport).
         * These don't have to set the title, as they're in a first-level infobox.
         * We could simply take all references, but the idea here is to filter out
         * cases where multiple locations are specified.
         */
        foreach (GeoExtractor::$knownTemplatePredicates as $coord) {
            $propertyName = strtolower(urldecode($coord));
            if (!isset($boxProperties[$propertyName])) {
                continue;
            }
            $geoInfo = $this->extractGeoInfo($boxProperties[$propertyName]);
            if ($geoInfo) {
                $foundCoordinates[] = new geo_param($geoInfo);
                $this->log('debug', "Found infobox entry '" . implode("|", $geoInfo) . "'");
            }
        }
    }

    $numResults = 0;
    foreach ($foundCoordinates as $g) {
        if (!$g->is_valid()) {
            continue;
        }

        if (GeoExtractor::enablePreview) {
            /* Built as a single string so that no whitespace ends up inside the URL. */
            echo '<iframe src="http://maps.google.com/?q=' . $g->latdeg . ',' . $g->londeg . '&z=5"'
                . ' style="width: 1000px; height: 600px; border: none;" scrolling="no"></iframe>';
        }

        /* Only process the first result */
        if (++$numResults != 1) {
            continue;
        }

        $point = (string) $g->latdeg . " " . (string) $g->londeg;
        $sqlParams = array("resource" => $pageID, "lang" => $this->language, "point" => $point);

        /*
         * Duplicate detection (batch mode only): first look for this exact
         * entry, then for any entry of the resource; stop at the first hit.
         * NOTE(review): the mysql_query() results are not checked for failure
         * before being fetched - decide how a failed query should be treated.
         */
        $hasAny = false;
        if ($this->batchExtraction) {
            foreach (array(GeoExtractor::sqlCheckExact, GeoExtractor::sqlCheckAny) as $checkSql) {
                $results = mysql_query($this->sqlParameterize($checkSql, $sqlParams), $this->dbSharedConnection);
                $row = mysql_fetch_assoc($results);
                if ($row['count'] != '0') {
                    $hasAny = true;
                    break;
                }
            }
        }

        if ($hasAny) {
            $this->log('debug', "Not generating geocoordinates because coordinates were previously generated for this resource");
            continue;
        }

        /* Store in results table for duplicate detection */
        if ($this->batchExtraction) {
            mysql_query($this->sqlParameterize(GeoExtractor::sqlInsertPoint, $sqlParams), $this->dbSharedConnection);
        }

        /* Triple generation */

        /* W3C Geospatial Vocabulary (GeoRSS) */
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(GEORSS_POINT, false),
            RDFtriple::Literal($point)
        );

        /*
         * Basic Geo Vocabulary. These would be ambiguous if the resource already
         * had a point, but that case has been ruled out above, so they are
         * always added here.
         */
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(WGS_LAT, false),
            RDFtriple::Literal((string) $g->latdeg, XS_FLOAT, NULL)
        );
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(WGS_LONG, false),
            RDFtriple::Literal((string) $g->londeg, XS_FLOAT, NULL)
        );

        /*
         * Process additional attributes
         * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
         * and http://de.wikipedia.org/wiki/Wikipedia:WikiProjekt_Georeferenzierung/Neue_Koordinatenvorlage#Parameter
         */
        if (isset($g->attr['type'])) {
            $type = $g->attr['type'];

            if (isset(GeoExtractor::$typeToGeoNames[$type])) {
                $geoNames = GeoExtractor::$typeToGeoNames[$type];
                $result->addTriple(
                    $this->getPageURI(),
                    RDFtriple::URI(GEO_FEATURECLASS, false),
                    RDFtriple::URI(GEONAMES_NS . $geoNames[0])
                );
                if (isset($geoNames[1])) {
                    $result->addTriple(
                        $this->getPageURI(),
                        RDFtriple::URI(GEO_FEATURECODE, false),
                        RDFtriple::URI(GEONAMES_NS . $geoNames[0] . '.' . $geoNames[1])
                    );
                }
            }

            /* Lower-case the type itself (not the comparison result) so the checks below ignore case. */
            $typeLower = strtolower($type);

            /* city(pop): City, town or village with specified population */
            if ($typeLower === 'city' && isset($g->attr['arg:type'])) {
                $result->addTriple(
                    $this->getPageURI(),
                    RDFtriple::URI(GEO_POPULATION, false),
                    RDFtriple::Literal((string) $g->attr['arg:type'], XS_INTEGER, NULL)
                );
            }

            /*
             * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
             * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
             * but it seems as if any POI not matching the above categories is tagged as a landmark
             * (=> "Google" etc.)
             */
            if ($typeLower === 'landmark') {
                $result->addTriple(
                    $this->getPageURI(),
                    RDFtriple::URI(RDF_TYPE, false),
                    RDFtriple::URI(YAGO_LANDMARK, false)
                );
            }
        }

        /* population */
        if (isset($g->attr['pop'])) {
            $result->addTriple(
                $this->getPageURI(),
                RDFtriple::URI(GEO_POPULATION, false),
                RDFtriple::Literal((string) $g->attr['pop'], XS_INTEGER, NULL)
            );
        }

        /*
         * elevation (meters above sea level) is deliberately not emitted as
         * http://www.georss.org/georss/elev:
         * TODO must be converted from sea level *to* wgs84 ellipsoid!
         */

        /* Diameter (m) to GeoRSS radius (m) */
        if (isset($g->attr['dim'])) {
            $result->addTriple(
                $this->getPageURI(),
                RDFtriple::URI(GEORSS_RADIUS, false),
                RDFtriple::Literal((string) ($g->attr['dim'] / 2), XS_DECIMAL, NULL)
            );
        }

        /*
         * region (ISO 3166-1 alpha-2 country code or ISO 3166-2 region code) is
         * deliberately not mapped to GeoNames inCountry: it is very unreliable,
         * as it just sets a preferred map view, i.e. Germany's and
         * The Czech Republic's regions are set to "EN"....
         */
    }

    return $result;
}
/**
 * Collects the geographic coordinates found in a page and turns the first
 * valid one into RDF triples.
 *
 * Candidates are gathered in this order:
 *   1. extractGeoInfo() on the whole page source (reported as the "title entry"),
 *   2. per infobox, every entry of $this->knownInfoboxFormats whose latitude
 *      and longitude properties are present in the box,
 *   3. per infobox, every property named in $this->knownTemplatePredicates
 *      from which extractGeoInfo() returns coordinates.
 *
 * Only the first candidate for which is_valid() holds produces triples
 * (WGS84 lat/long plus type-dependent GeoNames/YAGO triples).
 *
 * Side effects on output: with GeoExtractor::enableDebug set, an <h3> line is
 * echoed for every candidate found; with GeoExtractor::enablePreview set, an
 * <iframe> map preview is echoed for every valid candidate.
 *
 * @param mixed  $pageID     Page identifier; passed to ExtractionResult and
 *                           RDFtriple::page().
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page source that is searched for coordinates.
 *
 * @return ExtractionResult Result object holding the generated triples (possibly none).
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $foundCoordinates = array();

    /* Main title */
    $geoInfo = $this->extractGeoInfo($pageSource, true);
    if ($geoInfo) {
        $foundCoordinates[] = new geo_param($geoInfo);
        if (GeoExtractor::enableDebug) {
            /* The text originates from the page source, so escape it before echoing it as HTML. */
            echo "<h3>Found title entry '" . htmlspecialchars(implode("|", $geoInfo), ENT_QUOTES) . "'</h3>";
        }
    }

    /* Coordinates provided in infobox formats */
    $infoboxes = $this->getInfoboxes($pageSource);
    foreach ($infoboxes[1] as $box) {
        $boxProperties = $this->getBoxProperties($box, true);

        foreach ($this->knownInfoboxFormats as $format) {
            /*
             * Global defaults. Index 0 is checked as latitude and index 4 as
             * longitude below; index 3 holds the N/S and index 7 the E/W marker.
             */
            $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

            /* Apply template-specific NS/EW defaults */
            if (isset($format[8])) {
                $pieces[3] = $format[8];
            }
            if (isset($format[9])) {
                $pieces[7] = $format[9];
            }

            /*
             * Copy from template: each format entry names the infobox property
             * that supplies the piece with the same index.
             * NOTE(review): the loop also visits indices 8 and 9 (the NS/EW
             * defaults), so a box property with that name would be copied into
             * $pieces as well - confirm this is intended.
             */
            $numFields = count($format);
            for ($i = 0; $i < $numFields; $i++) {
                $formatString = urldecode($format[$i]);
                $propertyName = strtolower($formatString);
                if ($formatString != "" && isset($boxProperties[$propertyName])) {
                    $value = $boxProperties[$propertyName];
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    $pieces[$i] = ($i == 7 && $value == 'O') ? 'E' : $value;
                }
            }

            if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                $foundCoordinates[] = new geo_param($pieces);
                if (GeoExtractor::enableDebug) {
                    echo "<h3>Found format '" . htmlspecialchars($format[0], ENT_QUOTES) . "'</h3>";
                }
            }
        }

        /*
         * Look for coordinate tags inside the infobox.
         * Used widely, e.g. airports (=> Los_Angeles_International_Airport).
         * These don't have to set the title, as they're in a first-level infobox.
         * We could simply take all references, but the idea here is to filter out
         * cases where multiple locations are specified.
         */
        foreach ($this->knownTemplatePredicates as $coord) {
            $propertyName = strtolower(urldecode($coord));
            if (!isset($boxProperties[$propertyName])) {
                continue;
            }
            $geoInfo = $this->extractGeoInfo($boxProperties[$propertyName]);
            if ($geoInfo) {
                $foundCoordinates[] = new geo_param($geoInfo);
                if (GeoExtractor::enableDebug) {
                    echo "<h3>Found infobox entry '" . htmlspecialchars(implode("|", $geoInfo), ENT_QUOTES) . "'</h3>";
                }
            }
        }
    }

    $numResults = 0;
    foreach ($foundCoordinates as $g) {
        if (!$g->is_valid()) {
            continue;
        }

        if (GeoExtractor::enablePreview) {
            /* Built as a single string so that no whitespace ends up inside the URL. */
            echo '<iframe src="http://christianhbecker.com/da/workspace/gmap.php?lat=' . $g->latdeg
                . '&long=' . $g->londeg . '&scale=5"'
                . ' style="width: 500px; height: 300px; border: none;" scrolling="no"></iframe>';
        }

        /* Only process the first result */
        if (++$numResults != 1) {
            continue;
        }

        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"),
            RDFtriple::Literal((string) $g->latdeg, "http://www.w3.org/2001/XMLSchema#float", NULL)
        );
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"),
            RDFtriple::Literal((string) $g->londeg, "http://www.w3.org/2001/XMLSchema#float", NULL)
        );

        /*
         * Process additional attributes
         * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
         */
        if (!isset($g->attr['type'])) {
            continue;
        }
        $type = $g->attr['type'];

        if (isset($this->typeToGeoNames[$type])) {
            $geoNames = $this->typeToGeoNames[$type];
            $result->addTriple(
                RDFtriple::page($pageID),
                RDFtriple::URI("http://www.geonames.org/ontology#featureClass"),
                RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNames[0])
            );
            if (isset($geoNames[1])) {
                $result->addTriple(
                    RDFtriple::page($pageID),
                    RDFtriple::URI("http://www.geonames.org/ontology#featureCode"),
                    RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNames[0] . '.' . $geoNames[1])
                );
            }
        }

        /* Lower-case the type itself (not the comparison result) so the checks below ignore case. */
        $typeLower = strtolower($type);

        /* city(pop): City, town or village with specified population */
        if ($typeLower === 'city' && isset($g->attr['arg:type'])) {
            $result->addTriple(
                RDFtriple::page($pageID),
                RDFtriple::URI("http://www.geonames.org/ontology#population"),
                RDFtriple::Literal($g->attr['arg:type'], "http://www.w3.org/2001/XMLSchema#integer", NULL)
            );
        }

        /*
         * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
         * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
         * but it seems as if any POI not matching the above categories is tagged as a landmark
         * (=> "Google" etc.)
         */
        if ($typeLower === 'landmark') {
            $result->addTriple(
                RDFtriple::page($pageID),
                RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                RDFtriple::URI("http://dbpedia.org/class/yago/Landmark108624891")
            );
        }

        /*
         * region (ISO 3166-1 alpha-2 country code or ISO 3166-2 region code) is
         * deliberately not mapped to GeoNames inCountry: it is very unreliable,
         * as it just sets a preferred map view, i.e. Germany's and
         * The Czech Republic's regions are set to "EN"....
         */
    }

    return $result;
}