/**
 * Emits page metadata triples: modification timestamp, OAI identifier,
 * revision link and edit link.
 *
 * The metadata is read from $this->additionalInfo (keys 'oaiId',
 * 'language' and 'revision').
 *
 * @param mixed $pageID     page identifier; URL-encoded into the Wikipedia links
 * @param mixed $pageTitle  not used by this extractor
 * @param mixed $pageSource not used by this extractor
 * @return ExtractionResult the collected triples
 */
public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $triples = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $info = $this->additionalInfo;

     // dc:modified, stamped with the current time in ISO 8601 format
     $modified = RDFtriple::Literal(date('c'), XS_DATETIME, "");
     $this->log('trace', $modified->toString());
     $triples->addTriple($this->getPageURI(), RDFtriple::URI(DC_MODIFIED, false), $modified);

     // OAI identifier
     $this->log(DEBUG, $info['oaiId']);
     $oaiLiteral = new RDFLiteral($info['oaiId'], XS_INTEGER);
     $triples->addTriple($this->getPageURI(), RDFtriple::URI(DBM_OAIIDENTIFIER, false), $oaiLiteral);

     // Both links share the same index.php prefix for the page's language edition
     $wikiBase = 'http://' . $info['language'] . '.wikipedia.org/w/index.php?title=';

     // Revision link, e.g. http://en.wikipedia.org/w/index.php?title=Robotics&oldid=293678514
     $revisionLink = $wikiBase . urlencode($pageID) . '&oldid=' . $info['revision'];
     $this->log(DEBUG, $revisionLink);
     $triples->addTriple($this->getPageURI(), RDFtriple::URI(DBM_REVISION, false), RDFtriple::URI($revisionLink));

     // Edit link
     $editUrl = $wikiBase . urlencode($pageID) . '&action=edit';
     $this->log(DEBUG, $editUrl);
     $triples->addTriple($this->getPageURI(), RDFtriple::URI(DBM_EDITLINK, false), RDFtriple::URI($editUrl));

     return $triples;
 }
Example #2
0
 /**
  * Builds ontology triples for a property whose value is handed to
  * $this->parser after clean-up.
  *
  * Clean-up steps: wiki links are replaced, every {{cite ...}} template is
  * passed through Util::replaceTemplates(), then HTML tags and comments are
  * removed. One triple is produced per parser result; when the parser
  * returns null the cleaned value is emitted as a plain literal instead.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list of RDFtriple objects
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $value = Util::replaceWikiLinks($value);
     //TODO: why are cite templates only stripped in this case?
     // preg_match_all() defaults to PREG_PATTERN_ORDER, so $matches[0] is the
     // list of all full matches. Iterate over that list so that every cite
     // template is replaced, not only the first one.
     if (preg_match_all("/{{2}cite.*?\\}{2}/i", $value, $matches)) {
         foreach (array_unique($matches[0]) as $citeTemplate) {
             $value = str_replace($citeTemplate, Util::replaceTemplates($citeTemplate), $value);
         }
     }
     $value = Util::removeHtmlTags($value);
     $value = Util::removeHtmlComments($value);
     $parseResultArray = $this->parser->parse($value);
     if (isset($parseResultArray)) {
         foreach ($parseResultArray as $parseResults) {
             // Index 0 is the parsed value, index 1 its datatype; an empty
             // datatype is passed on as null.
             $parsedDataType = $parseResults[1];
             if ($parsedDataType == "") {
                 $parsedDataType = null;
             }
             $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal((string) $parseResults[0], $parsedDataType, null));
         }
     } else {
         //TODO: generate a logfile with unparsed values
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($value));
     }
     return $result;
 }
 /**
  * Builds an ontology triple from a date value parsed by $this->parser.
  *
  * The parser result is logged at debug level. When the parser returns
  * null, no triple is produced. Otherwise index 0 of the result is used as
  * the literal value and index 1 as its datatype.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list with zero or one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $parseResults = $this->parser->parse($value);
     // print_r() in return mode yields the same dump without manual output buffering.
     Logger::debug("Date parser \n" . "value was: {$value} \n" . print_r($parseResults, true));
     if (!isset($parseResults)) {
         return $result;
     }
     // NOTE(review): the parsed date is not checked against the calendar here
     // (no pattern match, no checkdate()); whatever the parser returns is
     // emitted as-is. Confirm the parser only yields valid dates.
     $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($parseResults[0], $parseResults[1], null));
     return $result;
 }
 /**
  * Emits SKOS triples for category pages.
  *
  * When $pageID contains "<category namespace>:", the page gets a
  * skos:prefLabel and the type skos:Concept, plus one skos:broader triple
  * for each category link ([[<namespace>:Name]] or
  * [[<namespace>:Name|sort key]]) found in the page source.
  *
  * @param string $pageID     page identifier, tested for the category namespace
  * @param string $pageTitle  title, decoded for the preferred label
  * @param string $pageSource wiki source scanned for category links
  * @return ExtractionResult the collected triples
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);
     // NOTE(review): $category is interpolated into the patterns unescaped;
     // confirm it never contains regex metacharacters or the "/" delimiter.
     if (!preg_match("/" . $category . ":(.*)/", $pageID)) {
         return $result;
     }
     $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_PREFLABEL, false), RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language));
     $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(SKOS_CONCEPT, false));
     // Non-greedy capture: with a greedy (.*) two links on the same line
     // ("[[Category:A]] [[Category:B]]") were captured as a single bogus name.
     if (preg_match_all("/\\[\\[" . $category . ":(.*?)\\]\\]/", $pageSource, $links, PREG_SET_ORDER)) {
         $categoryPrefix = Util::getDBpediaCategoryPrefix($this->language);
         foreach ($links as $link) {
             // Drop an optional "|sort key" suffix; explode() returns the whole
             // string as element 0 when there is no "|".
             $parts = explode('|', $link[1], 2);
             try {
                 $object = RDFtriple::URI($categoryPrefix . URI::wikipediaEncode($parts[0]));
             } catch (Exception $e) {
                 // Skip links that do not form a valid URI, but report them.
                 echo 'Caught exception: ', $e->getMessage(), "\n";
                 continue;
             }
             $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_BROADER, false), $object);
         }
     }
     return $result;
 }
 /**
  * Emits a single "abstract" triple holding the long abstract of the page.
  *
  * The source is stripped of wiki code and then passed to
  * extract_abstract() with the arguments (3000, false, false).
  * NOTE(review): 3000 is presumably a length limit -- confirm against
  * extract_abstract().
  *
  * @param mixed  $pageID     page identifier, used as the triple subject
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult result containing the abstract triple
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $triples = new ExtractionResult($pageID, $this->language, self::extractorID);
     $abstractText = $this->extract_abstract($this->remove_wikicode($pageSource), 3000, false, false);
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::predicate("abstract");
     $object = RDFtriple::Literal($abstractText, NULL, $this->language);
     $triples->addTriple($subject, $predicate, $object);
     return $triples;
 }
 /**
  * Builds one ontology triple per value returned by $this->parser.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list of RDFtriple objects; empty when the parser yields nothing
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $parseResults = $this->parser->parse($value);
     // A null parse result would make foreach raise a warning; treat it as
     // "nothing parsed" instead.
     if (!isset($parseResults)) {
         return $result;
     }
     foreach ($parseResults as $parsedString) {
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($parsedString));
     }
     return $result;
 }
 /**
  * Emits a single rdfs:comment triple holding the abstract of the page.
  *
  * The source is stripped of wiki code and passed to extract_abstract()
  * with its default arguments.
  *
  * @param mixed  $pageID     page identifier, used as the triple subject
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult result containing the comment triple
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $triples = new ExtractionResult($pageID, $this->language, self::extractorID);
     $shortAbstract = $this->extract_abstract($this->remove_wikicode($pageSource));
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#comment");
     $object = RDFtriple::Literal($shortAbstract, NULL, $this->language);
     $triples->addTriple($subject, $predicate, $object);
     return $triples;
 }
Example #8
0
 /**
  * Emits an rdfs:label triple carrying the decoded page title.
  *
  * No triple is added when decode_title() yields a value that compares
  * loosely equal to NULL.
  *
  * @param mixed $pageID     page identifier
  * @param mixed $pageTitle  title to decode and use as the label
  * @param mixed $pageSource not used by this extractor
  * @return ExtractionResult result with zero or one triple
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $labels = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->decode_title($pageTitle) != NULL) {
         $labelLiteral = RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language);
         $labels->addTriple($this->getPageURI(), RDFtriple::URI(RDFS_LABEL, false), $labelLiteral);
     }
     return $labels;
 }
Example #9
0
 /**
  * Describes every collected predicate as an rdf:Property with a label.
  *
  * For each key of $this->predicates two triples are added: an rdf:type
  * statement and an rdfs:label taken from getPredicateLabel().
  *
  * @return ExtractionResult the collected predicate triples
  */
 public function getPredicateTriples()
 {
     $rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
     $rdfProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property";
     $rdfsLabel = "http://www.w3.org/2000/01/rdf-schema#label";

     $collected = new ExtractionResult($this->pageID, $this->language, $this->extractorID);
     // Only the keys matter; the stored values are ignored.
     foreach ($this->predicates as $predicateUri => $ignored) {
         $collected->addTriple(RDFtriple::URI($predicateUri), RDFtriple::URI($rdfType), RDFtriple::URI($rdfProperty));
         $collected->addTriple(RDFtriple::URI($predicateUri), RDFtriple::URI($rdfsLabel), RDFtriple::Literal($this->getPredicateLabel($predicateUri)));
     }
     return $collected;
 }
Example #10
0
 /**
  * Builds an ontology triple from a value after stripping its wiki markup.
  *
  * The value is run through ActiveAbstractExtractor::stripMarkup() and
  * trimmed; a triple tagged with $this->language is produced only when
  * text remains.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list with zero or one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     // Kept as a variable in case stripMarkup() takes its second argument by reference.
     $image = "image";
     $plainText = trim(ActiveAbstractExtractor::stripMarkup($value, $image));
     if ($plainText == "") {
         return array();
     }
     $subject = RDFtriple::page($subjectName);
     $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
     $object = RDFtriple::Literal($plainText, null, $this->language);
     return array(new RDFtriple($subject, $predicate, $object));
 }
Example #11
0
 /**
  * Emits WGS84 lat/long triples from the first <geo>lat;long ...</geo> tag
  * found in the page source.
  *
  * @param mixed  $pageID     page identifier, used as the triple subject
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source scanned for a <geo> tag
  * @return ExtractionResult result with zero or two triples
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $triples = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Group 1 is the latitude, group 2 the longitude.
     $geoTagPattern = '/<geo>([\\-0-9\\.]+);([\\-0-9\\.]+)[^0-9]*[^<]*<\\/geo>/';
     if (!preg_match($geoTagPattern, $pageSource, $coords)) {
         return $triples;
     }
     $latitude = $coords[1];
     $longitude = $coords[2];
     $triples->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"),
         RDFtriple::Literal($latitude, "http://www.w3.org/2001/XMLSchema#float", NULL)
     );
     $triples->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"),
         RDFtriple::Literal($longitude, "http://www.w3.org/2001/XMLSchema#float", NULL)
     );
     return $triples;
 }
Example #12
0
 /**
  * Emits FOAF / biographical triples from the Persondata of a page.
  *
  * Birth and death places are resolved to English page names: the first
  * wiki link in the field is followed, the linked page's source is loaded
  * through DatabaseWikipedia, and its [[en:...]] language link is used.
  * Fields that are missing or empty produce no triple.
  *
  * @param mixed  $pageID     page identifier, used as the triple subject
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source handed to extractPersondata()
  * @return ExtractionResult the collected triples (empty when no Persondata was found)
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $PersonData = $this->extractPersondata($pageSource, $this->language);
     if ($PersonData == null) {
         return $result;
     }
     $WikiDB = new DatabaseWikipedia($this->language);

     // Resolve both place fields the same way. A place stays "" when the
     // field is absent, holds no wiki link, or the linked page has no
     // English language link; this avoids undefined-index notices and
     // pointless database lookups.
     $englishPlaces = array('birthplace' => "", 'deathplace' => "");
     foreach (array_keys($englishPlaces) as $field) {
         if (!isset($PersonData[$field])) {
             continue;
         }
         if (!preg_match("/\\[\\[([^\\]]*)\\]\\]/", $PersonData[$field], $linkMatch)) {
             continue;
         }
         $linkTarget = $this->getLinkForLabeledLink($linkMatch);
         $linkedSource = $WikiDB->getSource($linkTarget);
         // Non-greedy so a second link on the same line is not swallowed.
         if (preg_match("/\\[\\[en:(.*?)\\]\\]/", $linkedSource, $langLinkMatch)) {
             $englishPlaces[$field] = $langLinkMatch[1];
         }
     }

     // Name parts: skip missing or empty values instead of emitting empty literals.
     $namePredicates = array(
         'name' => "http://xmlns.com/foaf/0.1/name",
         'givenname' => "http://xmlns.com/foaf/0.1/givenname",
         'surname' => "http://xmlns.com/foaf/0.1/surname",
     );
     foreach ($namePredicates as $field => $predicateUri) {
         if (isset($PersonData[$field]) && $PersonData[$field] != "") {
             $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI($predicateUri), RDFtriple::Literal($PersonData[$field], null, "de"));
         }
     }
     if ($englishPlaces['birthplace'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birthPlace"), RDFtriple::page($englishPlaces['birthplace']));
     }
     if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birth"), RDFtriple::Literal($PersonData['birthdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if ($englishPlaces['deathplace'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("deathPlace"), RDFtriple::page($englishPlaces['deathplace']));
     }
     if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("death"), RDFtriple::Literal($PersonData['deathdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if (isset($PersonData['description']) && $PersonData['description'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://purl.org/dc/elements/1.1/description"), RDFtriple::Literal($PersonData['description'], null, "de"));
     }
     // Every page with Persondata is typed as a foaf:Person.
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI("http://xmlns.com/foaf/0.1/Person"));
     return $result;
 }
Example #13
0
 /**
  * Sample extractor: emits a placeholder chemistry triple for pages that
  * contain the {{chembox header}} template.
  *
  * @param mixed  $pageID     page identifier
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source searched for the template
  * @return ExtractionResult result with zero or one triple
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Container for all triples extracted from this page
     $extracted = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     // Nothing to do unless the page carries {{chembox header}}
     if (!preg_match("/{{chembox header}}/", $pageSource)) {
         return $extracted;
     }

     // Real parsing would go here; one triple is added per property ...
     $property = RDFtriple::URI(DB_MY_CHEM_PROPERTY, false);
     $extracted->addTriple($this->getPageURI(), $property, RDFtriple::Literal("my_value"));
     // ... and each predicate is registered in the predicate collection
     $this->allPredicates->addPredicate("my_chem_property");

     return $extracted;
 }
Example #14
0
 /**
  * Builds a single ontology triple from a cleaned and parsed value.
  *
  * Templates (only when "{{" occurs), HTML tags, HTML comments and wiki
  * emphasis are removed before parsing. The parser result is used as the
  * literal; when it is null the cleaned value is used instead.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list with exactly one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $cleaned = $value;
     if (strpos($cleaned, "{{") !== false) {
         $cleaned = Util::replaceTemplates($cleaned);
     }
     $cleaned = Util::removeWikiEmphasis(Util::removeHtmlComments(Util::removeHtmlTags($cleaned)));

     $parsed = $this->parser->parse($cleaned);
     //TODO: add a debug logfile for un-parsed triples
     $literalText = isset($parsed) ? $parsed : $cleaned;

     $subject = RDFtriple::page($subjectName);
     $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
     return array(new RDFtriple($subject, $predicate, RDFtriple::Literal($literalText)));
 }
Example #15
0
 /**
  * Scans the templates in $text for PND identifiers and adds them to $result.
  *
  * Three templates are recognised by name:
  *  - "Normdaten": the value of its PND= parameter is an individualised PND
  *  - "PND": the first argument is an individualised PND, the second a
  *    non-individualised PND (each only when at least 9 characters long)
  *  - "PNDfehlt": the first argument is a non-individualised PND (at least
  *    9 characters long)
  *
  * @param string           $text      wiki source handed to Util::getTemplates()
  * @param mixed            $pageTitle not used; triples use $this->pageID as subject
  * @param ExtractionResult $result    receives the triples (passed by reference)
  * @return void
  */
 public function findPND($text, $pageTitle, &$result)
 {
     $templates = Util::getTemplates($text);
     foreach ($templates as $template) {
         if ($template["name"] == "Normdaten") {
             preg_match('/\\|\\s*PND\\s*=\\s*([0-9X]*)/i', $template["content"], $match);
             // The capture group also matches an empty "PND=" parameter; do
             // not emit an empty identifier in that case.
             if (isset($match[1]) && $match[1] !== '') {
                 // add individualised PND
                 $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_INDIVIDUALISED_PND, false), RDFtriple::Literal($match[1], NULL, NULL));
             }
         } elseif ($template["name"] == "PND") {
             // Group 1 is the first argument, group 2 the remainder of the template.
             preg_match('/\\s*PND\\s*\\|\\s*([0-9X]*)(.*)/i', $template["content"], $match);
             if (isset($match[1]) && strlen($match[1]) >= 9) {
                 // add individualised PND
                 $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_INDIVIDUALISED_PND, false), RDFtriple::Literal($match[1], NULL, NULL));
             }
             if (isset($match[2])) {
                 preg_match('/\\|\\s*([0-9X]*)/i', $match[2], $match1);
                 if (isset($match1[1]) && strlen($match1[1]) >= 9) {
                     // add non-individualised PND
                     $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false), RDFtriple::Literal($match1[1], NULL, NULL));
                 }
             }
         } elseif ($template["name"] == "PNDfehlt") {
             preg_match('/\\s*PNDfehlt\\s*(\\|\\s*.*)/i', $template["content"], $match);
             if (isset($match[1])) {
                 preg_match('/\\|\\s*([0-9X]*)/i', $match[1], $match1);
                 if (isset($match1[1]) && strlen($match1[1]) >= 9) {
                     // add non-individualised PND
                     $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false), RDFtriple::Literal($match1[1], NULL, NULL));
                 }
             }
         }
     }
 }
 /**
  * Emits a short comment and a full abstract for the page.
  *
  * Only the first 8192 bytes of the source are considered. After markup
  * stripping, the comment is what _extractStart($text, 2) returns and the
  * abstract is everything before the first "==" (or the whole text when
  * there is none). Empty values produce no triple.
  *
  * @param mixed  $pageID     page identifier
  * @param mixed  $pageTitle  not used by this extractor
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult result with up to two triples
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     //TODO image namespace
     //TODO not sure what to take as magic number here:
     // 4096 was too short, e.g. inappropriate for London
     $text = self::stripMarkup(substr($pageSource, 0, 8192));
     //TODO REMOVE THIS LINE FOR DEBUGGING:
     $text = $this->_exceptions($text);
     // 2 is probably perfect, since it guarantees a certain length
     $firstTwoSentences = $this->_extractStart($text, 2);

     // Heuristic to tidy up the abstract: take everything up to the first
     // '=='. The position may also be 0, which yields an empty abstract.
     $headingPos = strpos($text, '==');
     if ($headingPos === false) {
         $fullabstract = trim($text);
     } else {
         $fullabstract = trim(substr($text, 0, $headingPos));
     }

     // Comment first, abstract second; each is emitted only when non-empty.
     $candidates = array(
         array(DBCOMM_COMMENT, $firstTwoSentences),
         array(DBCOMM_ABSTRACT, $fullabstract),
     );
     foreach ($candidates as $candidate) {
         if (empty($candidate[1])) {
             continue;
         }
         $s = $this->getPageURI();
         $p = RDFtriple::URI($candidate[0], false);
         $o = RDFtriple::Literal($candidate[1], NULL, $this->language);
         $this->log('debug', 'Found: ' . $s->toString() . " " . $p->toString() . " " . $o->toString());
         $result->addTriple($s, $p, $o);
     }
     //TODO $clipped = substr( $extract, 0, 1024 );
     //TODO UtfNormal::cleanUp( $clipped ); in include/normal/UtfNormal
     return $result;
 }
 /**
  * Builds ontology triples from the values $this->parser finds in $value.
  *
  * HTML tags and comments are stripped first. Each parser result is a pair
  * (value, datatype); pairs with an empty value are skipped and an empty
  * datatype is passed on as null. When the parser returns null, the
  * cleaned input is emitted as a plain literal.
  *
  * @param string $subjectName  page name used as the triple subject
  * @param string $propertyName local name appended to DB_ONTOLOGY_NS
  * @param string $value        raw property value
  * @return array list of RDFtriple objects
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $triples = array();
     $cleaned = Util::removeHtmlComments(Util::removeHtmlTags($value));
     $parsedValues = $this->parser->parse($cleaned);

     // Nothing parsed: fall back to the cleaned text itself.
     if (!isset($parsedValues)) {
         $triples[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($cleaned));
         return $triples;
     }

     foreach ($parsedValues as $entry) {
         $datatype = ($entry[1] == "") ? null : $entry[1];
         if ($entry[0] == "") {
             continue;
         }
         $triples[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal((string) $entry[0], $datatype, null));
     }
     return $triples;
 }
Example #18
0
    /**
     * Extracts geographic coordinates from a page and emits geo triples.
     *
     * Coordinates are collected from three places, in this order: the page
     * itself (title coordinates), the known infobox formats, and coordinate
     * templates nested inside infobox properties. Only the first valid
     * coordinate is turned into triples. In batch mode the point is checked
     * against (and stored in) the shared database so that a resource does not
     * get coordinates twice.
     *
     * @param mixed  $pageID     page identifier, also used as the "resource" SQL parameter
     * @param mixed  $pageTitle  not used by this extractor
     * @param string $pageSource wiki source of the page
     * @return ExtractionResult the collected triples
     */
    public function extractPage($pageID, $pageTitle, $pageSource)
    {
        $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
        $foundCoordinates = array();
        /* Main title */
        if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
            $pageG = new geo_param($geoInfo);
            array_push($foundCoordinates, $pageG);
            $this->log('debug', "Found title entry '" . implode("|", $geoInfo) . "'");
        }
        /* Coordinates provided in infobox formats */
        $infoboxes = $this->getInfoboxes($pageSource);
        foreach ($infoboxes[1] as $box) {
            $boxProperties = $this->getBoxProperties($box, true);
            foreach (GeoExtractor::$knownInfoboxFormats as $format) {
                /* Initialize global defaults */
                $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');
                /* Apply template-specific NS/EW defaults */
                if (isset($format[8])) {
                    /* NS default */
                    $pieces[3] = $format[8];
                }
                if (isset($format[9])) {
                    /* EW default */
                    $pieces[7] = $format[9];
                }
                /* Copy from template */
                $formatLength = count($format);
                for ($i = 0; $i < $formatLength; $i++) {
                    /* Box properties are looked up by lower-cased property name */
                    $propertyKey = strtolower(urldecode($format[$i]));
                    if ($propertyKey != "" && isset($boxProperties[$propertyKey])) {
                        /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                        if ($i == 7 && $boxProperties[$propertyKey] == 'O') {
                            $pieces[$i] = 'E';
                        } else {
                            $pieces[$i] = $boxProperties[$propertyKey];
                        }
                    }
                }
                if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                    $g = new geo_param($pieces);
                    array_push($foundCoordinates, $g);
                    $this->log('debug', "Found format '" . $format[0] . "'");
                }
            }
            /* $this->knownInfoboxFormats */
            /*
             * Look for coordinate tags inside the infobox
             * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
             * These don't have to set the title, as they're in a first-level infobox
             * We could simply take all references, but the idea here is to filter out
             * cases where multiple locations are specified
             */
            foreach (GeoExtractor::$knownTemplatePredicates as $coord) {
                $coordKey = strtolower(urldecode($coord));
                if (isset($boxProperties[$coordKey]) && ($geoInfo = $this->extractGeoInfo($boxProperties[$coordKey]))) {
                    $pageG = new geo_param($geoInfo);
                    array_push($foundCoordinates, $pageG);
                    $this->log('debug', "Found infobox entry '" . implode("|", $geoInfo) . "'");
                }
            }
        }
        /* infoboxes */
        $numResults = 0;
        foreach ($foundCoordinates as $g) {
            if ($g->is_valid()) {
                /* Debug preview: embeds a map for every valid coordinate */
                if (GeoExtractor::enablePreview) {
                    ?>
                     <iframe src="http://maps.google.com/?q=<?php 
                    echo $g->latdeg;
                    ?>
,<?php 
                    echo $g->londeg;
                    ?>
&z=5" style="width: 1000px; height: 600px; border: none;" scrolling="no">
                     </iframe>
                     <?php 
                }
                /* Only process the first result */
                if (++$numResults == 1) {
                    $point = (string) $g->latdeg . " " . (string) $g->londeg;
                    $sqlParams = array("resource" => $pageID, "lang" => $this->language, "point" => $point);
                    /*
                     * Duplicate detection (batch mode only): first look for this exact
                     * entry, then for any entry of the resource.
                     * NOTE(review): the mysql_* functions were removed in PHP 7; this
                     * block needs porting (mysqli/PDO) before running on a current PHP.
                     */
                    $hasAny = false;
                    if ($this->batchExtraction) {
                        $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckExact, $sqlParams), $this->dbSharedConnection);
                        $row = mysql_fetch_assoc($results);
                        $hasAny = $row['count'] != '0';
                        if (!$hasAny) {
                            /* Check whether any entry is present */
                            $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckAny, $sqlParams), $this->dbSharedConnection);
                            $row = mysql_fetch_assoc($results);
                            $hasAny = $row['count'] != '0';
                        }
                    }
                    if ($hasAny) {
                        $this->log('debug', "Not generating geocoordinates because coordinates were previously generated for this resource");
                    } else {
                        /* Store in results table for duplicate detection */
                        if ($this->batchExtraction) {
                            mysql_query($this->sqlParameterize(GeoExtractor::sqlInsertPoint, $sqlParams), $this->dbSharedConnection);
                        }
                        /* Triple generation */
                        /* W3C Geospatial Vocabulary (GeoRSS) */
                        $result->addTriple($this->getPageURI(), RDFtriple::URI(GEORSS_POINT, false), RDFtriple::Literal($point));
                        /*
                         * Basic Geo Vocabulary. $hasAny is known to be false in this branch,
                         * so these are always added alongside the GeoRSS point.
                         * NOTE(review): a former inner "$hasAny" re-check here could never
                         * fire; the outer condition may once have been meant to test only
                         * for an exact match -- confirm the intended rule.
                         */
                        $result->addTriple($this->getPageURI(), RDFtriple::URI(WGS_LAT, false), RDFtriple::Literal((string) $g->latdeg, XS_FLOAT, NULL));
                        $result->addTriple($this->getPageURI(), RDFtriple::URI(WGS_LONG, false), RDFtriple::Literal((string) $g->londeg, XS_FLOAT, NULL));
                        /*
                         * Process additional attributes
                         * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
                         * and http://de.wikipedia.org/wiki/Wikipedia:WikiProjekt_Georeferenzierung/Neue_Koordinatenvorlage#Parameter
                         */
                        if (isset($g->attr['type'])) {
                            $type = $g->attr['type'];
                            if (isset(GeoExtractor::$typeToGeoNames[$type])) {
                                $geoNames = GeoExtractor::$typeToGeoNames[$type];
                                $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_FEATURECLASS, false), RDFtriple::URI(GEONAMES_NS . $geoNames[0]));
                                if (isset($geoNames[1])) {
                                    $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_FEATURECODE, false), RDFtriple::URI(GEONAMES_NS . $geoNames[0] . '.' . $geoNames[1]));
                                }
                            }
                            /*
                             * The type is compared case-insensitively. Previously strtolower()
                             * wrapped the whole comparison, so only exact lower-case values matched.
                             */
                            $typeLower = strtolower($type);
                            /* city(pop): City, town or village with specified population */
                            if ($typeLower == 'city' && isset($g->attr['arg:type'])) {
                                $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['arg:type'], XS_INTEGER, NULL));
                            }
                            /*
                             * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
                             * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
                             * but it seems as if any POI not matching the above categories is tagged as a landmark
                             * (=> "Google" etc.)
                             */
                            if ($typeLower == 'landmark') {
                                $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(YAGO_LANDMARK, false));
                            }
                        }
                        /* type */
                        /* population */
                        if (isset($g->attr['pop'])) {
                            $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['pop'], XS_INTEGER, NULL));
                        }
                        /*
                         * Elevation in meters above sea level is not emitted:
                         * TODO must be converted from sea level *to* wgs84 ellipsoid!
                         */
                        /* Diameter (m) to GeoRSS radius (m) */
                        if (isset($g->attr['dim'])) {
                            $result->addTriple($this->getPageURI(), RDFtriple::URI(GEORSS_RADIUS, false), RDFtriple::Literal((string) ($g->attr['dim'] / 2), XS_DECIMAL, NULL));
                        }
                        /*
                         * region (ISO 3166-1 alpha-2 country code or ISO 3166-2 region code) is
                         * not mapped to GeoNames inCountry: very unreliable, as it just sets a
                         * preferred map view, i.e. Germany's and The Czech Republic's regions
                         * are set to "EN"....
                         */
                    }
                    /* !$hasAny */
                }
                /* first entry */
            }
            /* is_valid */
        }
        /* $foundCoordinates */
        return $result;
    }
Example #19
0
 /**
  * Wraps a single triple in a fresh ExtractionResult and hands it to the
  * given destination. The accept() call is timed under the key
  * 'destination:accept'.
  *
  * @param $destination receiver of the result; must not be null
  * @param $pageID      converted to the subject URI via RDFtriple::page()
  * @param $predicate   must already be an RDF URI
  * @param $text        converted to a literal via RDFtriple::Literal(),
  *                     tagged with $this->language
  * @return void
  */
 private function writeTriple($destination, $pageID, $predicate, $text)
 {
     $single = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $single->addTriple(
         RDFtriple::page($pageID),
         $predicate,
         RDFtriple::Literal($text, NULL, $this->language)
     );

     Timer::start('destination:accept');
     $destination->accept($single);
     Timer::stop('destination:accept');
 }
Example #20
0
 /**
  * Extract person metadata from a page and emit it as FOAF / DBpedia triples.
  *
  * Person data is obtained from extractPersondata(). When it yields nothing,
  * an empty result is returned. Otherwise the name, given name, surname,
  * birth/death date, birth/death place and description are emitted (each only
  * when present and non-empty), followed by an rdf:type foaf:Person triple.
  *
  * Birth and death places are resolved to their English page titles: the first
  * wiki link in the field is looked up through the wiki database and the
  * "[[en:...]]" language link of that page's source is used. The lookup only
  * happens when the option 'Persondata.usedb' is enabled; without it no place
  * triples are produced.
  *
  * @param $pageID page identifier used for the extraction result
  * @param $pageTitle not used by this extractor
  * @param $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $PersonData = $this->extractPersondata($pageSource, $this->language);
     if ($PersonData == null) {
         return $result;
     }

     $useDb = Options::getOption('Persondata.usedb');
     $WikiDB = null;
     if ($useDb) {
         $WikiDB = new DatabaseWikipediaCollection($this->language);
     }

     // Resolve both place fields with the same procedure.
     $englishPlaces = array('birthplace' => null, 'deathplace' => null);
     foreach (array_keys($englishPlaces) as $field) {
         if (!isset($PersonData[$field])) {
             continue;
         }
         preg_match("/\\[\\[([^\\]]*)\\]\\]/", $PersonData[$field], $linkMatch);
         if (!isset($linkMatch[0])) {
             continue;
         }
         $linkTarget = $this->getLinkForLabeledLink($linkMatch);
         // Start from an empty source for every field. The previous code
         // initialised "$mysource" but read "$mySource" (variable names are
         // case-sensitive), so an undefined variable reached preg_match()
         // whenever the database lookup was disabled.
         $linkedSource = "";
         if ($useDb) {
             $linkedSource = $WikiDB->getSource($linkTarget);
         }
         preg_match("/\\[\\[en:(.*)\\]\\]/", $linkedSource, $langLinkMatch);
         if (isset($langLinkMatch[1])) {
             $englishPlaces[$field] = $langLinkMatch[1];
         }
     }
     $BirthPlace = $englishPlaces['birthplace'];
     $DeathPlace = $englishPlaces['deathplace'];

     // NOTE(review): literals are tagged "de" regardless of $this->language;
     // presumably this extractor only runs on the German wiki - confirm.
     if (isset($PersonData['name']) && $PersonData['name'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_NAME, false), RDFtriple::Literal($PersonData['name'], null, "de"));
     }
     if (isset($PersonData['givenname']) && $PersonData['givenname'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_GIVENNAME, false), RDFtriple::Literal($PersonData['givenname'], null, "de"));
     }
     if (isset($PersonData['surname']) && $PersonData['surname'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_SURNAME, false), RDFtriple::Literal($PersonData['surname'], null, "de"));
     }
     if (isset($BirthPlace) && $BirthPlace != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTHPLACE, false), RDFtriple::page($BirthPlace));
     }
     if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTH, false), RDFtriple::Literal($PersonData['birthdate'], XS_DATE, null));
     }
     if (isset($DeathPlace) && $DeathPlace != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATHPLACE, false), RDFtriple::page($DeathPlace));
     }
     if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATH, false), RDFtriple::Literal($PersonData['deathdate'], XS_DATE, null));
     }
     if (isset($PersonData['description']) && $PersonData['description'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DC_DESCRIPTION, false), RDFtriple::Literal($PersonData['description'], null, "de"));
     }
     // Every page with person data is typed as a foaf:Person.
     $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(FOAF_PERSON, false));

     return $result;
 }
 /**
  * Append a literal-valued triple about a page to an extraction result.
  *
  * @param $result object receiving the triple via addTriple()
  * @param $pageID turned into the subject with RDFtriple::page()
  * @param $class passed to RDFtriple::property() together with $property
  * @param $property see $class; $this->flagNewSchema is passed as third argument
  * @param $value literal value
  * @param $datatype optional datatype handed to RDFtriple::Literal()
  * @param $lang optional language handed to RDFtriple::Literal()
  * @return void
  */
 private function addLiteral($result, $pageID, $class, $property, $value, $datatype = null, $lang = null)
 {
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::property($class, $property, $this->flagNewSchema);
     $object = RDFtriple::Literal($value, $datatype, $lang);

     $result->addTriple($subject, $predicate, $object);
 }
Example #22
0
 /**
  * Emit one rdfs:label triple for the page.
  *
  * The label is the page title run through decode_title(), tagged with
  * $this->language.
  *
  * @param $pageID turned into the subject with RDFtriple::page()
  * @param $pageTitle source of the label text
  * @param $pageSource not used by this extractor
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);

     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#label");
     $label = RDFtriple::Literal($this->decode_title($pageTitle), null, $this->language);
     $extraction->addTriple($subject, $predicate, $label);

     return $extraction;
 }
Example #23
0
 /**
  * Add an OWL axiom annotation carrying the current timestamp.
  *
  * The predicate is the DC_MODIFIED URI; the object is date('c') typed as
  * XS_DATETIME with an empty language tag.
  *
  * @return void
  */
 public function addDCModifiedAnnotation()
 {
     $this->addOWLAxiomAnnotation(
         RDFtriple::URI(DC_MODIFIED, false),
         RDFtriple::Literal(date('c'), XS_DATETIME, "")
     );
 }
Example #24
0
    /**
     * Extract geographic coordinates from a page and emit them as triples.
     *
     * Candidate coordinates are collected, in this order, from
     *  1. the page-level (title) coordinate found by extractGeoInfo(),
     *  2. every infobox, by matching its properties against each entry of
     *     $this->knownInfoboxFormats,
     *  3. every infobox property named in $this->knownTemplatePredicates that
     *     contains a coordinate recognised by extractGeoInfo().
     *
     * Only the first valid candidate produces triples: wgs84 lat/long and,
     * when the coordinate carries a "type" attribute, GeoNames feature
     * class/code, population (type "city") and a YAGO landmark type
     * (type "landmark").
     *
     * When GeoExtractor::enableDebug / GeoExtractor::enablePreview are set,
     * HTML is written directly to the output.
     *
     * @param $pageID turned into the triple subject with RDFtriple::page()
     * @param $pageTitle not used by this extractor
     * @param $pageSource wiki source of the page
     * @return ExtractionResult
     */
    public function extractPage($pageID, $pageTitle, $pageSource)
    {
        $result = new ExtractionResult($pageID, $this->language, self::extractorID);
        $foundCoordinates = array();
        /* Main title */
        if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
            $foundCoordinates[] = new geo_param($geoInfo);
            if (GeoExtractor::enableDebug) {
                echo "<h3>Found title entry '" . implode("|", $geoInfo) . "'</h3>";
            }
        }
        /* Coordinates provided in infobox formats */
        $infoboxes = $this->getInfoboxes($pageSource);
        foreach ($infoboxes[1] as $box) {
            $boxProperties = $this->getBoxProperties($box, true);
            foreach ($this->knownInfoboxFormats as $format) {
                /* Global defaults: slots 0 and 4 are latitude/longitude, 3 and 7 the hemisphere letters */
                $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');
                /* Apply template-specific NS/EW defaults */
                if (isset($format[8])) {
                    $pieces[3] = $format[8];
                }
                if (isset($format[9])) {
                    $pieces[7] = $format[9];
                }
                /*
                 * Copy from template.
                 * NOTE(review): the loop also visits indices 8 and 9 (the
                 * NS/EW defaults) and treats them as property names - confirm
                 * whether that is intended.
                 */
                $fieldCount = count($format);
                for ($i = 0; $i < $fieldCount; $i++) {
                    $formatString = urldecode($format[$i]);
                    if ($formatString == "") {
                        continue;
                    }
                    $propertyKey = strtolower($formatString);
                    if (!isset($boxProperties[$propertyKey])) {
                        continue;
                    }
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    if ($i == 7 && $boxProperties[$propertyKey] == 'O') {
                        $pieces[$i] = 'E';
                    } else {
                        $pieces[$i] = $boxProperties[$propertyKey];
                    }
                }
                if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                    $foundCoordinates[] = new geo_param($pieces);
                    if (GeoExtractor::enableDebug) {
                        echo "<h3>Found format '" . $format[0] . "'</h3>";
                    }
                }
            }
            /* $this->knownInfoboxFormats */
            /*
             * Look for coordinate tags inside the infobox
             * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
             * These don't have to set the title, as they're in a first-level infobox
             * We could simply take all references, but the idea here is to filter out
             * cases where multiple locations are specified
             */
            foreach ($this->knownTemplatePredicates as $coord) {
                $coordKey = strtolower(urldecode($coord));
                if (isset($boxProperties[$coordKey]) && ($geoInfo = $this->extractGeoInfo($boxProperties[$coordKey]))) {
                    $foundCoordinates[] = new geo_param($geoInfo);
                    if (GeoExtractor::enableDebug) {
                        echo "<h3>Found infobox entry '" . implode("|", $geoInfo) . "'</h3>";
                    }
                }
            }
        }
        /* infoboxes */
        $numResults = 0;
        foreach ($foundCoordinates as $g) {
            if (!$g->is_valid()) {
                continue;
            }
            /* The preview is rendered for every valid candidate, not just the first */
            if (GeoExtractor::enablePreview) {
                ?>
                     <iframe src="http://christianhbecker.com/da/workspace/gmap.php?lat=<?php 
                echo $g->latdeg;
                ?>
&long=<?php 
                echo $g->londeg;
                ?>
&scale=5" style="width: 500px; height: 300px; border: none;" scrolling="no">
                     </iframe>
                     <?php 
            }
            /* Only process the first result */
            if (++$numResults != 1) {
                continue;
            }
            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"), RDFtriple::Literal((string) $g->latdeg, "http://www.w3.org/2001/XMLSchema#float", null));
            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"), RDFtriple::Literal((string) $g->londeg, "http://www.w3.org/2001/XMLSchema#float", null));
            /*
             * Process additional attributes
             * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
             */
            if (isset($g->attr['type'])) {
                $type = $g->attr['type'];
                if (isset($this->typeToGeoNames[$type])) {
                    $geoNames = $this->typeToGeoNames[$type];
                    $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#featureClass"), RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNames[0]));
                    if (isset($geoNames[1])) {
                        $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#featureCode"), RDFtriple::URI("http://www.geonames.org/ontology#" . $geoNames[0] . '.' . $geoNames[1]));
                    }
                }
                /*
                 * Compare the lower-cased type. The previous code lower-cased
                 * the result of the comparison instead of the type, so only an
                 * exact lower-case match was ever recognised.
                 */
                $normalizedType = strtolower($type);
                /* city(pop): City, town or village with specified population */
                if ($normalizedType == 'city' && isset($g->attr['arg:type'])) {
                    $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.geonames.org/ontology#population"), RDFtriple::Literal($g->attr['arg:type'], "http://www.w3.org/2001/XMLSchema#integer", null));
                }
                /*
                 * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
                 * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
                 * but it seems as if any POI not matching the above categories is tagged as a landmark
                 * (=> "Google" etc.)
                 */
                if ($normalizedType == 'landmark') {
                    $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI("http://dbpedia.org/class/yago/Landmark108624891"));
                }
            }
            /* type */
            /*
             * region: ISO 3166-1 alpha-2 country code or ISO 3166-2 region code to GeoNames inCountry
             * Deliberately not emitted: very unreliable, as it just sets a preferred map view,
             * i.e. Germany's and The Czech Republic's regions are set to "EN"....
             */
        }
        /* $foundCoordinates */
        return $result;
    }
Example #25
0
/**
 * Wrap parseAttributeValue() and turn everything it produced into RDF triples.
 *
 * parseAttributeValue() can deliver data through two channels: its return
 * value and the global $parseResult. Both are collected, brought into the
 * common shape (object, object type, datatype, language) and converted into
 * triples whose subject is RDFtriple::page($templateChildName) and whose
 * predicate is RDFtriple::URI($propertyName).
 *
 * Object type 'l' yields a literal (HTML line breaks become "\n"), 'r' yields
 * a URI (HTML line breaks are removed); anything else is logged and skipped.
 *
 * @global mixed $parseResult set to null before and after the wrapped call
 * @param mixed $value passed through to parseAttributeValue()
 * @param mixed $templateChildName passed through; also the subject page
 * @param mixed $propertyName passed through; also the predicate URI
 * @param mixed $language passed through; fallback language for literals
 * @return array list of RDFtriple objects, possibly empty
 */
function parseAttributeValueWrapper($value, $templateChildName, $propertyName, $language)
{
    global $parseResult;
    $parseResult = null;
    $returned = parseAttributeValue($value, $templateChildName, $propertyName, $language);

    // Phase 1: merge the global and the returned channel into one tuple list.
    $tuples = array();
    if (isset($parseResult)) {
        foreach ($parseResult as $entry) {
            // Global entries carry two leading fields that are not used here.
            list(, , $obj, $kind, $type, $lang) = $entry;
            $tuples[] = array($obj, $kind, $type, $lang);
        }
    }
    $parseResult = null;
    if (isset($returned)) {
        list($obj, $kind, $type, $lang) = $returned;
        $tuples[] = array($obj, $kind, $type, $lang);
    }

    // Phase 2: convert every tuple into a triple.
    $lineBreaks = array('<br>', '<br/>', '<br />');
    $triples = array();
    foreach ($tuples as $tuple) {
        list($object, $objectType, $dataType, $objectLanguage) = $tuple;
        // The parser receives the language but does not hand it back,
        // so fall back to the one we were called with.
        if (!isset($objectLanguage)) {
            $objectLanguage = $language;
        }

        $isLiteral = $objectType == 'l';
        $isResource = $objectType == "r";

        // special newline handling (literal is checked first here)
        if ($isLiteral) {
            $object = str_replace($lineBreaks, "\n", $object);
        } elseif ($isResource) {
            $object = str_replace($lineBreaks, '', $object);
        }

        // node construction (resource is checked first here)
        if ($isResource) {
            $object = RDFtriple::URI($object);
        } elseif ($isLiteral) {
            $object = RDFtriple::Literal($object, $dataType, $objectLanguage);
        } else {
            Logger::warn("Shouldn't happen - found a blank node where none expected - objectType = {$objectType}");
            continue;
        }

        $triples[] = new RDFtriple(RDFtriple::page($templateChildName), RDFtriple::URI($propertyName), $object);
    }

    return $triples;
}