Beispiel #1
0
    /**
     * Extracts the geographic coordinates of a page and emits them as RDF triples.
     *
     * Candidate coordinates are collected, in this order, from:
     *  1. the page-level ("main title") coordinate entry,
     *  2. infobox properties matching one of GeoExtractor::$knownInfoboxFormats,
     *  3. coordinate entries found in infobox properties listed in
     *     GeoExtractor::$knownTemplatePredicates.
     *
     * Only the first valid candidate is turned into triples. When
     * $this->batchExtraction is set, the shared database is consulted first and
     * nothing is generated if any point was already stored for the resource;
     * otherwise the new point is inserted for later duplicate detection.
     *
     * When GeoExtractor::enablePreview is set, an HTML map preview is written to
     * the output for every valid candidate.
     *
     * @param mixed  $pageID     Page identifier; passed to the ExtractionResult and
     *                           used as the "resource" SQL parameter.
     * @param string $pageTitle  Page title (not used by this method).
     * @param string $pageSource Page source that is searched for coordinates.
     *
     * @return ExtractionResult Result holding the generated triples (possibly none).
     */
    public function extractPage($pageID, $pageTitle, $pageSource)
    {
        $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
        $foundCoordinates = array();

        /* Main title */
        if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
            $foundCoordinates[] = new geo_param($geoInfo);
            $this->log('debug', "Found title entry '" . implode("|", $geoInfo) . "'");
        }

        /* Coordinates provided in infobox formats */
        $infoboxes = $this->getInfoboxes($pageSource);
        foreach ($infoboxes[1] as $box) {
            $boxProperties = $this->getBoxProperties($box, true);

            foreach (GeoExtractor::$knownInfoboxFormats as $format) {
                /* Global defaults: lat deg/min/sec/NS, long deg/min/sec/EW */
                $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

                /* Template-specific NS/EW defaults */
                if (isset($format[8])) {
                    $pieces[3] = $format[8];
                }
                if (isset($format[9])) {
                    $pieces[7] = $format[9];
                }

                /*
                 * Copy the values from the template.
                 * NOTE(review): the loop also visits indices 8 and 9, which hold
                 * default hemisphere values rather than property names - confirm
                 * whether it should stop at index 7.
                 */
                $formatLength = count($format);
                for ($i = 0; $i < $formatLength; $i++) {
                    $formatString = urldecode($format[$i]);
                    if ($formatString === '') {
                        continue;
                    }
                    $propertyKey = strtolower($formatString);
                    if (!isset($boxProperties[$propertyKey])) {
                        continue;
                    }
                    $value = $boxProperties[$propertyKey];
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    $pieces[$i] = ($i == 7 && $value == 'O') ? 'E' : $value;
                }

                if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                    $foundCoordinates[] = new geo_param($pieces);
                    $this->log('debug', "Found format '" . $format[0] . "'");
                }
            }
            /* GeoExtractor::$knownInfoboxFormats */

            /*
             * Look for coordinate tags inside the infobox
             * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
             * These don't have to set the title, as they're in a first-level infobox
             * We could simply take all references, but the idea here is to filter out
             * cases where multiple locations are specified
             */
            foreach (GeoExtractor::$knownTemplatePredicates as $coord) {
                $predicateKey = strtolower(urldecode($coord));
                if (isset($boxProperties[$predicateKey]) && ($geoInfo = $this->extractGeoInfo($boxProperties[$predicateKey]))) {
                    $foundCoordinates[] = new geo_param($geoInfo);
                    $this->log('debug', "Found infobox entry '" . implode("|", $geoInfo) . "'");
                }
            }
        }
        /* infoboxes */

        $numResults = 0;
        foreach ($foundCoordinates as $g) {
            if ($g->is_valid()) {
                if (GeoExtractor::enablePreview) {
                    ?>
                     <iframe src="http://maps.google.com/?q=<?php 
                    echo $g->latdeg;
                    ?>
,<?php 
                    echo $g->londeg;
                    ?>
&z=5" style="width: 1000px; height: 600px; border: none;" scrolling="no">
                     </iframe>
                     <?php 
                }
                /* Only process the first result */
                if (++$numResults == 1) {
                    $point = (string) $g->latdeg . " " . (string) $g->londeg;
                    $pageURI = $this->getPageURI();
                    $sqlParams = array("resource" => $pageID, "lang" => $this->language, "point" => $point);

                    /* Check whether this exact entry is present */
                    if ($this->batchExtraction) {
                        $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckExact, $sqlParams), $this->dbSharedConnection);
                        $row = mysql_fetch_assoc($results);
                        $hasExact = $row['count'] != '0';
                        if ($hasExact) {
                            $hasAny = true;
                        } else {
                            /* Check whether any entry is present */
                            $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckAny, $sqlParams), $this->dbSharedConnection);
                            $row = mysql_fetch_assoc($results);
                            $hasAny = $row['count'] != '0';
                        }
                    } else {
                        $hasExact = $hasAny = false;
                    }

                    if ($hasAny) {
                        $this->log('debug', "Not generating geocoordinates because coordinates were previously generated for this resource");
                    } else {
                        /* Store in results table for duplicate detection */
                        if ($this->batchExtraction) {
                            mysql_query($this->sqlParameterize(GeoExtractor::sqlInsertPoint, $sqlParams), $this->dbSharedConnection);
                        }

                        /* Triple generation */
                        /* W3C Geospatial Vocabulary (GeoRSS) */
                        $result->addTriple($pageURI, RDFtriple::URI(GEORSS_POINT, false), RDFtriple::Literal($point));

                        /*
                         * Basic Geo Vocabulary. This branch is only reached when no point
                         * exists for the resource so far, so the values are unambiguous.
                         */
                        $result->addTriple($pageURI, RDFtriple::URI(WGS_LAT, false), RDFtriple::Literal((string) $g->latdeg, XS_FLOAT, NULL));
                        $result->addTriple($pageURI, RDFtriple::URI(WGS_LONG, false), RDFtriple::Literal((string) $g->londeg, XS_FLOAT, NULL));

                        /*
                         * Process additional attributes
                         * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
                         * and http://de.wikipedia.org/wiki/Wikipedia:WikiProjekt_Georeferenzierung/Neue_Koordinatenvorlage#Parameter
                         */
                        if (isset($g->attr['type'])) {
                            $type = $g->attr['type'];
                            /* Lowercase the type itself (not the comparison result) so that matching is case-insensitive */
                            $typeLower = strtolower($type);

                            if (isset(GeoExtractor::$typeToGeoNames[$type])) {
                                $geoNamesInfo = GeoExtractor::$typeToGeoNames[$type];
                                $result->addTriple($pageURI, RDFtriple::URI(GEO_FEATURECLASS, false), RDFtriple::URI(GEONAMES_NS . $geoNamesInfo[0]));
                                if (isset($geoNamesInfo[1])) {
                                    $result->addTriple($pageURI, RDFtriple::URI(GEO_FEATURECODE, false), RDFtriple::URI(GEONAMES_NS . $geoNamesInfo[0] . '.' . $geoNamesInfo[1]));
                                }
                            }

                            /* city(pop): City, town or village with specified population */
                            if ($typeLower === 'city' && isset($g->attr['arg:type'])) {
                                $result->addTriple($pageURI, RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['arg:type'], XS_INTEGER, NULL));
                            }

                            /*
                             * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
                             * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
                             * but it seems as if any POI not matching the above categories is tagged as a landmark
                             * (=> "Google" etc.)
                             */
                            if ($typeLower === 'landmark') {
                                $result->addTriple($pageURI, RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(YAGO_LANDMARK, false));
                            }
                        }
                        /* type */

                        /* population */
                        if (isset($g->attr['pop'])) {
                            $result->addTriple($pageURI, RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['pop'], XS_INTEGER, NULL));
                        }

                        /* elevation in meters above sea level
                         * TODO must be converted from sea level *to* wgs84 ellipsoid!
                         */
                        /*if (isset($g->attr['elevation']))  {
                              $result->addTriple(
                                      $this->getPageURI(),
                                      RDFtriple::URI("http://www.georss.org/georss/elev"),
                                      RDFtriple::Literal((string) $g->attr['elevation']);
                          }*/

                        /* Diameter (m) to GeoRSS radius (m) */
                        if (isset($g->attr['dim'])) {
                            $result->addTriple($pageURI, RDFtriple::URI(GEORSS_RADIUS, false), RDFtriple::Literal((string) ($g->attr['dim'] / 2), XS_DECIMAL, NULL));
                        }

                        /*
                         * region: ISO 3166-1 alpha-2 country code or ISO 3166-2 region code to GeoNames inCountry
                         * Very unreliable, as it just sets a preferred map view,
                         * i.e. Germany's and The Czech Republic's regions are set to "EN"....
                         */
                        /*if (isset($g->attr['region']))
                          {
                                  $result->addTriple(
                                          $this->getPageURI(),
                                          RDFtriple::URI("http://www.geonames.org/ontology#inCountry"),
                                          RDFtriple::URI("http://www.geonames.org/countries/#".
                                                          substr($g->attr['region'], 0, 2)));
                          }  */
                    }
                    /* !$hasAny */
                }
                /* first entry */
            }
            /* is_valid */
        }
        /* $foundCoordinates */

        return $result;
    }
Beispiel #2
0
    /**
     * Extracts the geographic coordinates of a page and emits them as RDF triples.
     *
     * Candidate coordinates are collected, in this order, from:
     *  1. the page-level ("main title") coordinate entry,
     *  2. infobox properties matching one of $this->knownInfoboxFormats,
     *  3. coordinate entries found in infobox properties listed in
     *     $this->knownTemplatePredicates.
     *
     * Only the first valid candidate is turned into triples (WGS84 lat/long plus
     * triples derived from the coordinate's "type" attribute).
     *
     * When GeoExtractor::enableDebug is set, HTML debug headings are written to
     * the output; when GeoExtractor::enablePreview is set, an HTML map preview is
     * written for every valid candidate.
     *
     * @param mixed  $pageID     Page identifier; passed to the ExtractionResult and
     *                           to RDFtriple::page() as the triple subject.
     * @param string $pageTitle  Page title (not used by this method).
     * @param string $pageSource Page source that is searched for coordinates.
     *
     * @return ExtractionResult Result holding the generated triples (possibly none).
     */
    public function extractPage($pageID, $pageTitle, $pageSource)
    {
        $result = new ExtractionResult($pageID, $this->language, self::extractorID);
        $foundCoordinates = array();

        /* Main title */
        if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
            $foundCoordinates[] = new geo_param($geoInfo);
            if (GeoExtractor::enableDebug) {
                /* Page-derived text: escape it before embedding it in HTML */
                echo "<h3>Found title entry '" . htmlspecialchars(implode("|", $geoInfo), ENT_QUOTES, 'UTF-8') . "'</h3>";
            }
        }

        /* Coordinates provided in infobox formats */
        $infoboxes = $this->getInfoboxes($pageSource);
        foreach ($infoboxes[1] as $box) {
            $boxProperties = $this->getBoxProperties($box, true);

            foreach ($this->knownInfoboxFormats as $format) {
                /* Global defaults: lat deg/min/sec/NS, long deg/min/sec/EW */
                $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

                /* Template-specific NS/EW defaults */
                if (isset($format[8])) {
                    $pieces[3] = $format[8];
                }
                if (isset($format[9])) {
                    $pieces[7] = $format[9];
                }

                /*
                 * Copy the values from the template.
                 * NOTE(review): the loop also visits indices 8 and 9, which hold
                 * default hemisphere values rather than property names - confirm
                 * whether it should stop at index 7.
                 */
                $formatLength = count($format);
                for ($i = 0; $i < $formatLength; $i++) {
                    $formatString = urldecode($format[$i]);
                    if ($formatString === '') {
                        continue;
                    }
                    $propertyKey = strtolower($formatString);
                    if (!isset($boxProperties[$propertyKey])) {
                        continue;
                    }
                    $value = $boxProperties[$propertyKey];
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    $pieces[$i] = ($i == 7 && $value == 'O') ? 'E' : $value;
                }

                if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                    $foundCoordinates[] = new geo_param($pieces);
                    if (GeoExtractor::enableDebug) {
                        echo "<h3>Found format '" . htmlspecialchars($format[0], ENT_QUOTES, 'UTF-8') . "'</h3>";
                    }
                }
            }
            /* $this->knownInfoboxFormats */

            /*
             * Look for coordinate tags inside the infobox
             * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
             * These don't have to set the title, as they're in a first-level infobox
             * We could simply take all references, but the idea here is to filter out
             * cases where multiple locations are specified
             */
            foreach ($this->knownTemplatePredicates as $coord) {
                $predicateKey = strtolower(urldecode($coord));
                if (isset($boxProperties[$predicateKey]) && ($geoInfo = $this->extractGeoInfo($boxProperties[$predicateKey]))) {
                    $foundCoordinates[] = new geo_param($geoInfo);
                    if (GeoExtractor::enableDebug) {
                        /* Page-derived text: escape it before embedding it in HTML */
                        echo "<h3>Found infobox entry '" . htmlspecialchars(implode("|", $geoInfo), ENT_QUOTES, 'UTF-8') . "'</h3>";
                    }
                }
            }
        }
        /* infoboxes */

        $numResults = 0;
        foreach ($foundCoordinates as $g) {
            if ($g->is_valid()) {
                if (GeoExtractor::enablePreview) {
                    ?>
                     <iframe src="http://christianhbecker.com/da/workspace/gmap.php?lat=<?php 
                    echo $g->latdeg;
                    ?>
&long=<?php 
                    echo $g->londeg;
                    ?>
&scale=5" style="width: 500px; height: 300px; border: none;" scrolling="no">
                     </iframe>
                     <?php 
                }
                /* Only process the first result */
                if (++$numResults == 1) {
                    $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"), RDFtriple::Literal((string) $g->latdeg, "http://www.w3.org/2001/XMLSchema#float", NULL));
                    $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"), RDFtriple::Literal((string) $g->londeg, "http://www.w3.org/2001/XMLSchema#float", NULL));

                    /*
                     * Process additional attributes
                     * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
                     */
                    if (isset($g->attr['type'])) {
                        $type = $g->attr['type'];
                        /* Lowercase the type itself (not the comparison result) so that matching is case-insensitive */
                        $typeLower = strtolower($type);

                        if (isset($this->typeToGeoNames[$type])) {
                            $geoNamesInfo = $this->typeToGeoNames[$type];
                            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#featureClass"), RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNamesInfo[0]));
                            if (isset($geoNamesInfo[1])) {
                                $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#featureCode"), RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNamesInfo[0] . '.' . $geoNamesInfo[1]));
                            }
                        }

                        /* city(pop): City, town or village with specified population */
                        if ($typeLower === 'city' && isset($g->attr['arg:type'])) {
                            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#population"), RDFtriple::Literal($g->attr['arg:type'], "http://www.w3.org/2001/XMLSchema#integer", NULL));
                        }

                        /*
                         * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
                         * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
                         * but it seems as if any POI not matching the above categories is tagged as a landmark
                         * (=> "Google" etc.)
                         */
                        if ($typeLower === 'landmark') {
                            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI("http://dbpedia.org/class/yago/Landmark108624891"));
                        }
                    }
                    /* type */

                    /*
                     * region: ISO 3166-1 alpha-2 country code or ISO 3166-2 region code to GeoNames inCountry
                     * Very unreliable, as it just sets a preferred map view,
                     * i.e. Germany's and The Czech Republic's regions are set to "EN"....
                     */
                    /*if (isset($g->attr['region']))
                      {
                              $result->addTriple(
                                      RDFtriple::page($pageID),
                                      RDFtriple::URI("http://www.geonames.org/ontology#inCountry"),
                                      RDFtriple::URI("http://www.geonames.org/countries/#".
                                                      substr($g->attr['region'], 0, 2)));
                      }  */
                }
                /* first entry */
            }
            /* is_valid */
        }
        /* $foundCoordinates */

        return $result;
    }