/**
 * Emits page-level metadata triples: a dc:modified timestamp, the OAI
 * identifier, a link to the page revision and a link to the edit form.
 *
 * The language, revision and OAI id are read from $this->additionalInfo
 * (keys 'language', 'revision', 'oaiId').
 *
 * @param string $pageID     Page identifier; URL-encoded into both links.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Not used by this extractor.
 * @return ExtractionResult  Result holding the four metadata triples.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $info = $this->additionalInfo;

    // dc:modified - timestamp of this extraction run (ISO 8601).
    $modified = RDFtriple::Literal(date('c'), XS_DATETIME, "");
    $this->log('trace', $modified->toString());
    $result->addTriple($this->getPageURI(), RDFtriple::URI(DC_MODIFIED, false), $modified);

    // OAI identifier
    $this->log(DEBUG, $info['oaiId']);
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(DBM_OAIIDENTIFIER, false),
        new RDFLiteral($info['oaiId'], XS_INTEGER)
    );

    // Revision link, e.g.
    // http://en.wikipedia.org/w/index.php?title=Robotics&oldid=293678514
    $revisionLink = 'http://' . $info['language'] . '.wikipedia.org/w/index.php?title='
        . urlencode($pageID) . '&oldid=' . $info['revision'];
    $this->log(DEBUG, $revisionLink);
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(DBM_REVISION, false),
        RDFtriple::URI($revisionLink)
    );

    // Edit link
    $editFormLink = 'http://' . $info['language'] . '.wikipedia.org/w/index.php?title='
        . urlencode($pageID) . '&action=edit';
    $this->log(DEBUG, $editFormLink);
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(DBM_EDITLINK, false),
        RDFtriple::URI($editFormLink)
    );

    return $result;
}
/**
 * Builds ontology triples for a property value that is run through
 * $this->parser after wiki links, cite templates, HTML tags and HTML
 * comments have been processed by the Util helpers.
 *
 * One triple is produced per parser result; each result is read as
 * array(value, datatype). When the parser returns null, a single triple
 * carrying the cleaned-up raw value is produced instead.
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value (wiki markup).
 * @return array List of RDFtriple objects.
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    $value = Util::replaceWikiLinks($value);

    // TODO (original author): why are cite templates only expanded in this case?
    // preg_match_all() fills $matches in pattern order, so $matches[0] holds
    // every full match. The previous loop iterated $matches itself and thus
    // only ever expanded the first cite template found.
    if (preg_match_all("/{{2}cite.*?\\}{2}/i", $value, $matches)) {
        foreach (array_unique($matches[0]) as $citeTemplate) {
            $value = str_replace($citeTemplate, Util::replaceTemplates($citeTemplate), $value);
        }
    }

    $value = Util::removeHtmlTags($value);
    $value = Util::removeHtmlComments($value);

    // Only the cleaned value is handed to the parser; no per-property unit
    // hint (e.g. weight_lb -> pounds) is passed along.
    $parseResultArray = $this->parser->parse($value);

    if (isset($parseResultArray)) {
        foreach ($parseResultArray as $parseResults) {
            // An empty datatype means "plain literal".
            $parsedDataType = $parseResults[1];
            if ($parsedDataType == "") {
                $parsedDataType = null;
            }
            $result[] = new RDFtriple(
                RDFtriple::page($subjectName),
                RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
                RDFtriple::Literal((string) $parseResults[0], $parsedDataType, null)
            );
        }
    } else {
        // TODO: generate logfile with unparsed values
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($value)
        );
    }

    return $result;
}
/**
 * Builds a single ontology triple from a date value parsed by $this->parser.
 *
 * The parser result is read as array(value, datatype). The raw value and the
 * parser result are written to the debug log. When the parser returns null,
 * no triple is produced.
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value.
 * @return array List with zero or one RDFtriple.
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    $parseResults = $this->parser->parse($value);

    // print_r() in return mode yields the same text that was previously
    // captured through an output buffer.
    $str = "Date parser \n";
    $str .= "value was: {$value} \n";
    $str .= print_r($parseResults, true);
    Logger::debug($str);

    if (!isset($parseResults)) {
        return $result;
    }

    // NOTE(review): the parsed date is not validated here (a former
    // pattern/checkdate() check was disabled, which left the fallback
    // branch unreachable); the parser result is trusted as-is.
    $result[] = new RDFtriple(
        RDFtriple::page($subjectName),
        RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
        RDFtriple::Literal($parseResults[0], $parseResults[1], null)
    );

    return $result;
}
/**
 * Extracts SKOS triples for category pages.
 *
 * Only pages whose ID contains "<category namespace>:" are processed. For
 * those, a skos:prefLabel and an rdf:type skos:Concept triple are emitted,
 * plus one skos:broader triple per category link found in the page source.
 *
 * @param string $pageID     Page identifier, checked for the category namespace.
 * @param string $pageTitle  Page title, decoded for the preferred label.
 * @param string $pageSource Wiki source scanned for [[<category>:...]] links.
 * @return ExtractionResult  Result holding the extracted triples (possibly empty).
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // NOTE(review): $category is interpolated into the patterns unquoted;
    // confirm whether getMediaWikiNamespace() can return regex metacharacters.
    $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);

    if (!preg_match("/" . $category . ":(.*)/", $pageID)) {
        return $result;
    }

    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(SKOS_PREFLABEL, false),
        RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language)
    );
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(RDF_TYPE, false),
        RDFtriple::URI(SKOS_CONCEPT, false)
    );

    // The capture is non-greedy so that two category links on the same line
    // are matched separately instead of being swallowed into one capture.
    if (preg_match_all("/\\[\\[" . $category . ":(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // Drop an optional "|sort key" suffix; explode() returns the whole
            // string as element 0 when there is no '|'.
            $parts = explode('|', $match[1]);
            $object = Util::getDBpediaCategoryPrefix($this->language) . URI::wikipediaEncode($parts[0]);

            try {
                $object = RDFtriple::URI($object);
            } catch (Exception $e) {
                echo 'Caught exception: ', $e->getMessage(), "\n";
                continue;
            }

            $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_BROADER, false), $object);
        }
    }

    return $result;
}
/**
 * Emits one "abstract" triple for the page.
 *
 * The page source is passed through remove_wikicode() and the outcome is
 * handed to extract_abstract() with the arguments (3000, false, false).
 *
 * @param string $pageID     Page identifier used as the triple subject.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding the abstract triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $triples = new ExtractionResult($pageID, $this->language, self::extractorID);

    $abstractText = $this->extract_abstract(
        $this->remove_wikicode($pageSource),
        3000,
        false,
        false
    );

    $triples->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::predicate("abstract"),
        RDFtriple::Literal($abstractText, NULL, $this->language)
    );

    return $triples;
}
/**
 * Builds one ontology triple per value returned by $this->parser.
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value handed to the parser.
 * @return array List of RDFtriple objects; empty when the parser yields nothing.
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    $parseResults = $this->parser->parse($value);

    // The sibling generators treat a null parse result as "nothing parsed";
    // guard here as well so the loop is never run over null.
    if (!isset($parseResults)) {
        return $result;
    }

    foreach ($parseResults as $parsedString) {
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($parsedString)
        );
    }

    return $result;
}
/**
 * Emits one rdfs:comment triple for the page.
 *
 * The page source is passed through remove_wikicode() and the outcome is
 * handed to extract_abstract() with its default arguments.
 *
 * @param string $pageID     Page identifier used as the triple subject.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding the comment triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $triples = new ExtractionResult($pageID, $this->language, self::extractorID);

    $shortAbstract = $this->extract_abstract($this->remove_wikicode($pageSource));

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#comment");
    $object = RDFtriple::Literal($shortAbstract, NULL, $this->language);
    $triples->addTriple($subject, $predicate, $object);

    return $triples;
}
/**
 * Emits an rdfs:label triple carrying the decoded page title.
 *
 * No triple is produced when the decoded title loosely equals NULL
 * (which, under PHP's loose comparison, includes the empty string).
 *
 * @param string $pageID     Page identifier passed to the ExtractionResult.
 * @param string $pageTitle  Page title, decoded via decode_title().
 * @param string $pageSource Not used by this extractor.
 * @return ExtractionResult  Result holding zero or one label triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // Decode once and reuse the value for both the check and the literal.
    $label = $this->decode_title($pageTitle);
    if ($label == NULL) {
        return $result;
    }

    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(RDFS_LABEL, false),
        RDFtriple::Literal($label, NULL, $this->language)
    );

    return $result;
}
/**
 * Describes every collected predicate with two triples: an rdf:type
 * rdf:Property statement and an rdfs:label taken from getPredicateLabel().
 *
 * Only the keys of $this->predicates are used; the values are ignored.
 *
 * @return ExtractionResult Result holding two triples per predicate.
 */
public function getPredicateTriples()
{
    $rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
    $rdfProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property";
    $rdfsLabel = "http://www.w3.org/2000/01/rdf-schema#label";

    $triples = new ExtractionResult($this->pageID, $this->language, $this->extractorID);

    foreach ($this->predicates as $predicateUri => $ignored) {
        $triples->addTriple(
            RDFtriple::URI($predicateUri),
            RDFtriple::URI($rdfType),
            RDFtriple::URI($rdfProperty)
        );
        $triples->addTriple(
            RDFtriple::URI($predicateUri),
            RDFtriple::URI($rdfsLabel),
            RDFtriple::Literal($this->getPredicateLabel($predicateUri))
        );
    }

    return $triples;
}
/**
 * Builds a language-tagged ontology triple from a value whose markup has
 * been stripped by ActiveAbstractExtractor::stripMarkup().
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value (wiki markup).
 * @return array List with one RDFtriple, or empty when nothing is left after
 *               stripping and trimming.
 */
public function generate($subjectName, $propertyName, $value)
{
    $image = "image";
    $plainText = trim(ActiveAbstractExtractor::stripMarkup($value, $image));

    // Nothing left after stripping: emit no triple at all.
    if ($plainText == "") {
        return array();
    }

    $subject = RDFtriple::page($subjectName);
    $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
    $object = RDFtriple::Literal($plainText, null, $this->language);

    return array(new RDFtriple($subject, $predicate, $object));
}
/**
 * Emits wgs84 lat/long triples from the first <geo>LAT;LONG...</geo> tag
 * found in the page source.
 *
 * @param string $pageID     Page identifier used as the triple subject.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source scanned for a <geo> tag.
 * @return ExtractionResult  Result holding two triples, or none when no tag matched.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $triples = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Locate geo coordinates
    $found = preg_match('/<geo>([\\-0-9\\.]+);([\\-0-9\\.]+)[^0-9]*[^<]*<\\/geo>/', $pageSource, $geo);
    if (!$found) {
        return $triples;
    }

    $latitude = $geo[1];
    $longitude = $geo[2];

    $triples->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"),
        RDFtriple::Literal($latitude, "http://www.w3.org/2001/XMLSchema#float", NULL)
    );
    $triples->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"),
        RDFtriple::Literal($longitude, "http://www.w3.org/2001/XMLSchema#float", NULL)
    );

    return $triples;
}
/**
 * Extracts person data (names, birth/death place and date, description)
 * from the page source and emits the corresponding triples, plus an
 * rdf:type foaf:Person triple.
 *
 * Nothing is emitted when extractPersondata() yields no data. Fields that
 * are missing or empty are skipped rather than emitted as empty literals.
 * Text literals are tagged with the language "de".
 *
 * @param string $pageID     Page identifier used as the triple subject.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding the extracted triples (possibly empty).
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $PersonData = $this->extractPersondata($pageSource, $this->language);
    if ($PersonData == null) {
        return $result;
    }

    $WikiDB = new DatabaseWikipedia($this->language);
    $BirthPlace = $this->lookupEnglishPlaceTitle(
        $WikiDB,
        isset($PersonData['birthplace']) ? $PersonData['birthplace'] : ''
    );
    $DeathPlace = $this->lookupEnglishPlaceTitle(
        $WikiDB,
        isset($PersonData['deathplace']) ? $PersonData['deathplace'] : ''
    );

    if (isset($PersonData['name']) && $PersonData['name'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://xmlns.com/foaf/0.1/name"),
            RDFtriple::Literal($PersonData['name'], null, "de")
        );
    }
    if (isset($PersonData['givenname']) && $PersonData['givenname'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://xmlns.com/foaf/0.1/givenname"),
            RDFtriple::Literal($PersonData['givenname'], null, "de")
        );
    }
    if (isset($PersonData['surname']) && $PersonData['surname'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://xmlns.com/foaf/0.1/surname"),
            RDFtriple::Literal($PersonData['surname'], null, "de")
        );
    }

    // A bio:event / bio:place modelling of the birth was sketched here
    // earlier but is disabled; only the direct birthPlace link is emitted.
    if ($BirthPlace != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("birthPlace"),
            RDFtriple::page($BirthPlace)
        );
    }
    if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("birth"),
            RDFtriple::Literal($PersonData['birthdate'], "http://www.w3.org/2001/XMLSchema#date", null)
        );
    }
    if ($DeathPlace != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("deathPlace"),
            RDFtriple::page($DeathPlace)
        );
    }
    if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("death"),
            RDFtriple::Literal($PersonData['deathdate'], "http://www.w3.org/2001/XMLSchema#date", null)
        );
    }
    if (isset($PersonData['description']) && $PersonData['description'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://purl.org/dc/elements/1.1/description"),
            RDFtriple::Literal($PersonData['description'], null, "de")
        );
    }

    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
        RDFtriple::URI("http://xmlns.com/foaf/0.1/Person")
    );

    return $result;
}

/**
 * Resolves the first [[wiki link]] in $placeMarkup to the target of the
 * [[en:...]] link found in that page's source.
 *
 * @param object $WikiDB      Source provider; getSource() is called on it.
 * @param string $placeMarkup Person-data field that may contain a wiki link.
 * @return string Captured [[en:...]] target, or '' when either lookup step fails.
 */
private function lookupEnglishPlaceTitle($WikiDB, $placeMarkup)
{
    // No wiki link in the field: skip the database lookup entirely.
    if (!preg_match("/\\[\\[([^\\]]*)\\]\\]/", $placeMarkup, $linkMatch)) {
        return '';
    }

    $placeSource = $WikiDB->getSource($this->getLinkForLabeledLink($linkMatch));

    if (!preg_match("/\\[\\[en:(.*)\\]\\]/", $placeSource, $langLinkMatch)) {
        return '';
    }

    return $langLinkMatch[1];
}
/**
 * Sample extractor: when the page source contains "{{chembox header}}",
 * emits one placeholder triple and registers the placeholder predicate.
 *
 * @param string $pageID     Page identifier passed to the ExtractionResult.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source scanned for the chembox header.
 * @return ExtractionResult  Result holding zero or one triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    // Holds all triples extracted for this page.
    $triples = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // Without a chembox header there is nothing to extract.
    if (!preg_match("/{{chembox header}}/", $pageSource)) {
        return $triples;
    }

    // Real parsing would go here; one triple is added per property.
    $triples->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(DB_MY_CHEM_PROPERTY, false),
        RDFtriple::Literal("my_value")
    );

    // Every predicate used is also recorded in the predicate collection.
    $this->allPredicates->addPredicate("my_chem_property");

    return $triples;
}
/**
 * Builds exactly one ontology triple for a textual property value.
 *
 * Templates are replaced only when the value contains "{{"; HTML tags, HTML
 * comments and wiki emphasis are then removed via the Util helpers. The
 * literal carries the parser result, or the cleaned value when the parser
 * returns null.
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value (wiki markup).
 * @return array List with one RDFtriple.
 */
public function generate($subjectName, $propertyName, $value)
{
    if (strpos($value, "{{") !== false) {
        $value = Util::replaceTemplates($value);
    }
    $value = Util::removeWikiEmphasis(
        Util::removeHtmlComments(
            Util::removeHtmlTags($value)
        )
    );

    $parsed = $this->parser->parse($value);

    // TODO: add debug logfile for un-parsed triples (fallback to raw value).
    $literalText = isset($parsed) ? $parsed : $value;

    return array(
        new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($literalText)
        ),
    );
}
/**
 * Scans the templates in $text for PND identifiers and adds them to $result.
 *
 * Three template names are handled:
 *  - "Normdaten": the "PND=" parameter is added as individualised PND.
 *  - "PND": the first parameter is added as individualised PND and a
 *    following "|" parameter as non-individualised PND, each only when it is
 *    at least 9 characters long.
 *  - "PNDfehlt": the first "|" parameter is added as non-individualised PND
 *    when it is at least 9 characters long.
 *
 * @param string           $text      Wiki source handed to Util::getTemplates().
 * @param string           $pageTitle Not used.
 * @param ExtractionResult $result    Receives the triples (passed by reference).
 * @return void
 */
public function findPND($text, $pageTitle, &$result)
{
    foreach (Util::getTemplates($text) as $tpl) {
        switch ($tpl["name"]) {
            case "Normdaten":
                preg_match('/\\|\\s*PND\\s*=\\s*([0-9X]*)/i', $tpl["content"], $found);
                if (isset($found[1])) {
                    // individualised PND
                    $result->addTriple(
                        RDFtriple::page($this->pageID),
                        RDFtriple::URI(DBO_INDIVIDUALISED_PND, false),
                        RDFtriple::Literal($found[1], NULL, NULL)
                    );
                }
                break;

            case "PND":
                preg_match('/\\s*PND\\s*\\|\\s*([0-9X]*)(.*)/i', $tpl["content"], $found);
                if (isset($found[1]) && strlen($found[1]) >= 9) {
                    // individualised PND
                    $result->addTriple(
                        RDFtriple::page($this->pageID),
                        RDFtriple::URI(DBO_INDIVIDUALISED_PND, false),
                        RDFtriple::Literal($found[1], NULL, NULL)
                    );
                }
                if (isset($found[2])) {
                    preg_match('/\\|\\s*([0-9X]*)/i', $found[2], $secondary);
                    if (isset($secondary[1]) && strlen($secondary[1]) >= 9) {
                        // non-individualised PND
                        $result->addTriple(
                            RDFtriple::page($this->pageID),
                            RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false),
                            RDFtriple::Literal($secondary[1], NULL, NULL)
                        );
                    }
                }
                break;

            case "PNDfehlt":
                preg_match('/\\s*PNDfehlt\\s*(\\|\\s*.*)/i', $tpl["content"], $found);
                if (isset($found[1])) {
                    preg_match('/\\|\\s*([0-9X]*)/i', $found[1], $secondary);
                    if (isset($secondary[1]) && strlen($secondary[1]) >= 9) {
                        // non-individualised PND
                        $result->addTriple(
                            RDFtriple::page($this->pageID),
                            RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false),
                            RDFtriple::Literal($secondary[1], NULL, NULL)
                        );
                    }
                }
                break;
        }
    }
}
/**
 * Extracts a short comment (first two sentences) and a longer abstract
 * (everything before the first "==") from the beginning of the page source.
 *
 * Only the first 8192 bytes of the source are considered. Each of the two
 * triples is emitted only when its text is non-empty.
 *
 * @param string $pageID     Page identifier passed to the ExtractionResult.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding up to two triples.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // TODO image namespace
    $text = $pageSource;

    // TODO not sure what to take as magic number here:
    // 4096 was too short, e.g. inappropriate for London.
    // Truncate on a character boundary where possible: a plain byte-wise
    // substr() can cut a multibyte character in half.
    // NOTE(review): assumes the page source is UTF-8 - confirm.
    if (strlen($text) > 8192) {
        $text = function_exists('mb_strcut')
            ? mb_strcut($text, 0, 8192, 'UTF-8')
            : substr($text, 0, 8192);
    }

    $text = self::stripMarkup($text);
    // TODO (original author): remove this line for debugging
    $text = $this->_exceptions($text);

    // 2 is probably ideal, since it guarantees a certain length.
    $firstTwoSentences = $this->_extractStart($text, 2);

    // Heuristic to tidy up the abstract: take everything up to the first
    // '==' (note the position may legitimately be 0), else the whole text.
    $pos = strpos($text, '==');
    if ($pos !== false) {
        $fullabstract = trim(substr($text, 0, $pos));
    } else {
        $fullabstract = trim($text);
    }

    if (!empty($firstTwoSentences)) {
        $s = $this->getPageURI();
        $p = RDFtriple::URI(DBCOMM_COMMENT, false);
        $o = RDFtriple::Literal($firstTwoSentences, NULL, $this->language);
        $this->log('debug', 'Found: ' . $s->toString() . " " . $p->toString() . " " . $o->toString());
        $result->addTriple($s, $p, $o);
    }

    if (!empty($fullabstract)) {
        $s = $this->getPageURI();
        $p = RDFtriple::URI(DBCOMM_ABSTRACT, false);
        $o = RDFtriple::Literal($fullabstract, NULL, $this->language);
        $this->log('debug', 'Found: ' . $s->toString() . " " . $p->toString() . " " . $o->toString());
        $result->addTriple($s, $p, $o);
    }

    // TODO $clipped = substr( $extract, 0, 1024 );
    // TODO UtfNormal::cleanUp( $clipped ); in include/normal/UtfNormal
    return $result;
}
/**
 * Builds ontology triples for a property value that is run through
 * $this->parser after HTML tags and comments have been removed.
 *
 * One triple is produced per non-empty parser result; each result is read
 * as array(value, datatype). When the parser returns null, a single triple
 * carrying the cleaned-up raw value is produced instead.
 *
 * @param string $subjectName  Page name used as the triple subject.
 * @param string $propertyName Property name appended to DB_ONTOLOGY_NS.
 * @param string $value        Raw property value.
 * @return array List of RDFtriple objects.
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    $value = Util::removeHtmlTags($value);
    $value = Util::removeHtmlComments($value);

    $parseResultArray = $this->parser->parse($value);

    if (!isset($parseResultArray)) {
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($value)
        );
        return $result;
    }

    foreach ($parseResultArray as $parseResults) {
        // An empty datatype means "plain literal".
        $parsedDataType = $parseResults[1];
        if ($parsedDataType == "") {
            $parsedDataType = null;
        }

        // Compare the string form strictly: a loose `!= ""` treats a numeric 0
        // as empty on PHP < 8 (but keeps the string "0"), silently dropping
        // legitimate zero values.
        $literalText = (string) $parseResults[0];
        if ($literalText !== "") {
            $result[] = new RDFtriple(
                RDFtriple::page($subjectName),
                RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
                RDFtriple::Literal($literalText, $parsedDataType, null)
            );
        }
    }

    return $result;
}
/**
 * Extracts geographic coordinates for a page and emits GeoRSS / WGS84
 * triples plus optional type, population and radius information.
 *
 * Candidate coordinates are collected from the page-level entry, from known
 * infobox formats and from known coordinate predicates inside infoboxes.
 * Only the first valid candidate is turned into triples. In batch mode a
 * shared database table is consulted so that a resource which already has a
 * stored point does not get coordinates again.
 *
 * When GeoExtractor::enablePreview is set, an <iframe> preview is written
 * to the output for every valid candidate.
 *
 * NOTE(review): the mysql_* functions used for the duplicate check were
 * removed in PHP 7; the calls are kept unchanged here.
 *
 * @param string $pageID     Page identifier; also stored as "resource" in the duplicate table.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding the extracted triples (possibly empty).
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $foundCoordinates = array();

    /* Main title */
    if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
        $pageG = new geo_param($geoInfo);
        array_push($foundCoordinates, $pageG);
        $this->log('debug', "Found title entry '" . implode("|", $geoInfo) . "'");
    }

    /* Coordinates provided in infobox formats */
    $infoboxes = $this->getInfoboxes($pageSource);
    foreach ($infoboxes[1] as $box) {
        $boxProperties = $this->getBoxProperties($box, true);

        foreach (GeoExtractor::$knownInfoboxFormats as $format) {
            /* Initialize global defaults */
            $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

            /* Apply template-specific NS/EW defaults */
            if (isset($format[8])) {
                /* NS default */
                $pieces[3] = $format[8];
            }
            if (isset($format[9])) {
                /* EW default */
                $pieces[7] = $format[9];
            }

            /* Copy from template */
            for ($i = 0; $i < count($format); $i++) {
                $formatString = urldecode($format[$i]);
                if ($formatString != "" && isset($boxProperties[strtolower($formatString)])) {
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    if ($i == 7 && $boxProperties[strtolower($formatString)] == 'O') {
                        $pieces[$i] = 'E';
                    } else {
                        $pieces[$i] = $boxProperties[strtolower($formatString)];
                    }
                }
            }

            if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                $g = new geo_param($pieces);
                array_push($foundCoordinates, $g);
                $this->log('debug', "Found format '" . $format[0] . "'");
            }
        } /* knownInfoboxFormats */

        /*
         * Look for coordinate tags inside the infobox
         * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
         * These don't have to set the title, as they're in a first-level infobox
         * We could simply take all references, but the idea here is to filter out
         * cases where multiple locations are specified
         */
        foreach (GeoExtractor::$knownTemplatePredicates as $coord) {
            if (isset($boxProperties[strtolower(urldecode($coord))])
                && ($geoInfo = $this->extractGeoInfo($boxProperties[strtolower(urldecode($coord))]))
            ) {
                $pageG = new geo_param($geoInfo);
                array_push($foundCoordinates, $pageG);
                $this->log('debug', "Found infobox entry '" . implode("|", $geoInfo) . "'");
            }
        }
    } /* infoboxes */

    $numResults = 0;
    foreach ($foundCoordinates as $g) {
        if (!$g->is_valid()) {
            continue;
        }

        if (GeoExtractor::enablePreview) {
            ?>
            <iframe src="http://maps.google.com/?q=<?php echo $g->latdeg; ?>,<?php echo $g->londeg; ?>&z=5" style="width: 1000px; height: 600px; border: none;" scrolling="no">
            </iframe>
            <?php
        }

        /* Only process the first result */
        if (++$numResults != 1) {
            continue;
        }

        $point = (string) $g->latdeg . " " . (string) $g->londeg;
        $sqlParams = array("resource" => $pageID, "lang" => $this->language, "point" => $point);

        if ($this->batchExtraction) {
            /* Check whether this exact entry is present */
            $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckExact, $sqlParams), $this->dbSharedConnection);
            $row = mysql_fetch_assoc($results);
            $hasAny = $row['count'] != '0';

            if (!$hasAny) {
                /* Check whether any entry is present */
                $results = mysql_query($this->sqlParameterize(GeoExtractor::sqlCheckAny, $sqlParams), $this->dbSharedConnection);
                $row = mysql_fetch_assoc($results);
                $hasAny = $row['count'] != '0';
            }
        } else {
            $hasAny = false;
        }

        if ($hasAny) {
            $this->log('debug', "Not generating geocoordinates because coordinates were previously generated for this resource");
            continue;
        }

        /* Store in results table for duplicate detection */
        if ($this->batchExtraction) {
            mysql_query($this->sqlParameterize(GeoExtractor::sqlInsertPoint, $sqlParams), $this->dbSharedConnection);
        }

        /* Triple generation */

        /* W3C Geospatial Vocabulary (GeoRSS) */
        $result->addTriple($this->getPageURI(), RDFtriple::URI(GEORSS_POINT, false), RDFtriple::Literal($point));

        /*
         * Basic Geo Vocabulary. This point is only reached when no point has
         * been stored for the resource so far, so the values are unambiguous.
         */
        $result->addTriple($this->getPageURI(), RDFtriple::URI(WGS_LAT, false), RDFtriple::Literal((string) $g->latdeg, XS_FLOAT, NULL));
        $result->addTriple($this->getPageURI(), RDFtriple::URI(WGS_LONG, false), RDFtriple::Literal((string) $g->londeg, XS_FLOAT, NULL));

        /*
         * Process additional attributes
         * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
         * and http://de.wikipedia.org/wiki/Wikipedia:WikiProjekt_Georeferenzierung/Neue_Koordinatenvorlage#Parameter
         */
        if (isset($g->attr['type'])) {
            if (isset(GeoExtractor::$typeToGeoNames[$g->attr['type']])) {
                $geoNamesType = GeoExtractor::$typeToGeoNames[$g->attr['type']];
                $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_FEATURECLASS, false), RDFtriple::URI(GEONAMES_NS . $geoNamesType[0]));
                if (isset($geoNamesType[1])) {
                    $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_FEATURECODE, false), RDFtriple::URI(GEONAMES_NS . $geoNamesType[0] . '.' . $geoNamesType[1]));
                }
            }

            /*
             * The type is lower-cased before comparing. The former code
             * lower-cased the comparison result instead, which made the
             * check case-sensitive.
             */
            $typeLower = strtolower($g->attr['type']);

            /* city(pop): City, town or village with specified population */
            if ($typeLower == 'city' && isset($g->attr['arg:type'])) {
                $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['arg:type'], XS_INTEGER, NULL));
            }

            /*
             * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
             * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
             * but it seems as if any POI not matching the above categories is tagged as a landmark
             * (=> "Google" etc.)
             */
            if ($typeLower == 'landmark') {
                $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(YAGO_LANDMARK, false));
            }
        } /* type */

        /* population */
        if (isset($g->attr['pop'])) {
            $result->addTriple($this->getPageURI(), RDFtriple::URI(GEO_POPULATION, false), RDFtriple::Literal((string) $g->attr['pop'], XS_INTEGER, NULL));
        }

        /*
         * Elevation in meters above sea level is not emitted.
         * TODO must be converted from sea level *to* wgs84 ellipsoid!
         */

        /* Diameter (m) to GeoRSS radius (m) */
        if (isset($g->attr['dim'])) {
            $result->addTriple($this->getPageURI(), RDFtriple::URI(GEORSS_RADIUS, false), RDFtriple::Literal((string) ($g->attr['dim'] / 2), XS_DECIMAL, NULL));
        }

        /*
         * region (ISO 3166-1 alpha-2 country code or ISO 3166-2 region code)
         * is not mapped to GeoNames inCountry: it is very unreliable, as it
         * just sets a preferred map view, i.e. Germany's and The Czech
         * Republic's regions are set to "EN"....
         */
    } /* $foundCoordinates */

    return $result;
}
/**
 * Wraps a single language-tagged literal triple in a fresh ExtractionResult
 * and hands it to the given destination.
 *
 * The call to $destination->accept() is timed under 'destination:accept'.
 *
 * @param $destination must not be null; accept() is called on it
 * @param $pageID      turned into the subject URI via RDFtriple::page()
 * @param $predicate   must be an RDF URI
 * @param $text        turned into a literal via RDFtriple::Literal(), tagged with $this->language
 * @return void
 */
private function writeTriple($destination, $pageID, $predicate, $text)
{
    $single = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $single->addTriple(
        RDFtriple::page($pageID),
        $predicate,
        RDFtriple::Literal($text, NULL, $this->language)
    );

    Timer::start('destination:accept');
    $destination->accept($single);
    Timer::stop('destination:accept');
}
/**
 * Extracts person data (names, birth/death place and date, description)
 * from the page source and emits the corresponding triples, plus an
 * rdf:type foaf:Person triple.
 *
 * Nothing is emitted when extractPersondata() yields no data. Missing or
 * empty fields are skipped. Birth and death places are only resolved when
 * the option 'Persondata.usedb' is enabled, because the [[en:...]] link is
 * looked up in the source of the linked place page. Text literals are
 * tagged with the language "de".
 *
 * @param string $pageID     Page identifier passed to the ExtractionResult.
 * @param string $pageTitle  Not used by this extractor.
 * @param string $pageSource Wiki source of the page.
 * @return ExtractionResult  Result holding the extracted triples (possibly empty).
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $PersonData = $this->extractPersondata($pageSource, $this->language);
    if ($PersonData == null) {
        return $result;
    }

    $WikiDB = null;
    if (Options::getOption('Persondata.usedb')) {
        $WikiDB = new DatabaseWikipediaCollection($this->language);
    }

    $BirthPlace = $this->resolveEnglishPlace($WikiDB, $PersonData, 'birthplace');
    $DeathPlace = $this->resolveEnglishPlace($WikiDB, $PersonData, 'deathplace');

    if (isset($PersonData['name']) && $PersonData['name'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_NAME, false), RDFtriple::Literal($PersonData['name'], null, "de"));
    }
    if (isset($PersonData['givenname']) && $PersonData['givenname'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_GIVENNAME, false), RDFtriple::Literal($PersonData['givenname'], null, "de"));
    }
    if (isset($PersonData['surname']) && $PersonData['surname'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_SURNAME, false), RDFtriple::Literal($PersonData['surname'], null, "de"));
    }

    // A bio:event / bio:place modelling of the birth was sketched here
    // earlier but is disabled; only the direct birth-place link is emitted.
    if (isset($BirthPlace) && $BirthPlace != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTHPLACE, false), RDFtriple::page($BirthPlace));
    }
    if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTH, false), RDFtriple::Literal($PersonData['birthdate'], XS_DATE, null));
    }
    if (isset($DeathPlace) && $DeathPlace != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATHPLACE, false), RDFtriple::page($DeathPlace));
    }
    if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATH, false), RDFtriple::Literal($PersonData['deathdate'], XS_DATE, null));
    }
    if (isset($PersonData['description']) && $PersonData['description'] != "") {
        $result->addTriple($this->getPageURI(), RDFtriple::URI(DC_DESCRIPTION, false), RDFtriple::Literal($PersonData['description'], null, "de"));
    }

    $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(FOAF_PERSON, false));

    return $result;
}

/**
 * Resolves the first [[wiki link]] in $PersonData[$field] to the target of
 * the [[en:...]] link found in that page's source.
 *
 * The page source is always initialised to an empty string, so no undefined
 * variable is read when no database is available. (The former code
 * initialised "$mysource" but read "$mySource".)
 *
 * @param object|null $WikiDB     Source provider, or null when the database is disabled.
 * @param array       $PersonData Extracted person data.
 * @param string      $field      Key to resolve ('birthplace' or 'deathplace').
 * @return string|null Captured [[en:...]] target, or null when it cannot be resolved.
 */
private function resolveEnglishPlace($WikiDB, $PersonData, $field)
{
    if (!isset($PersonData[$field])) {
        return null;
    }

    preg_match("/\\[\\[([^\\]]*)\\]\\]/", $PersonData[$field], $placeMatch);
    if (!isset($placeMatch[0])) {
        return null;
    }
    $placeLink = $this->getLinkForLabeledLink($placeMatch);

    $mySource = "";
    if ($WikiDB !== null) {
        $mySource = $WikiDB->getSource($placeLink);
    }

    preg_match("/\\[\\[en:(.*)\\]\\]/", $mySource, $LangLinkmatch);

    return isset($LangLinkmatch[1]) ? $LangLinkmatch[1] : null;
}
/**
 * Adds one literal-valued triple about the given page to $result.
 *
 * The predicate is built with RDFtriple::property() from $class, $property
 * and $this->flagNewSchema.
 *
 * @param ExtractionResult $result   Receives the triple.
 * @param string           $pageID   Turned into the subject via RDFtriple::page().
 * @param string           $class    First argument to RDFtriple::property().
 * @param string           $property Second argument to RDFtriple::property().
 * @param mixed            $value    Literal value.
 * @param string|null      $datatype Optional literal datatype.
 * @param string|null      $lang     Optional literal language tag.
 * @return void
 */
private function addLiteral($result, $pageID, $class, $property, $value, $datatype = null, $lang = null)
{
    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::property($class, $property, $this->flagNewSchema);
    $object = RDFtriple::Literal($value, $datatype, $lang);

    $result->addTriple($subject, $predicate, $object);
}
/**
 * Emits one rdfs:label triple carrying the decoded page title, tagged with
 * the extractor's language.
 *
 * @param string $pageID     Page identifier used as the triple subject.
 * @param string $pageTitle  Page title, decoded via decode_title().
 * @param string $pageSource Not used by this extractor.
 * @return ExtractionResult  Result holding the label triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $triples = new ExtractionResult($pageID, $this->language, self::extractorID);

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#label");
    $label = RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language);
    $triples->addTriple($subject, $predicate, $label);

    return $triples;
}
/**
 * Adds a dc:modified OWL axiom annotation carrying the current time as an
 * ISO 8601 xsd:dateTime literal.
 *
 * @return void
 */
public function addDCModifiedAnnotation()
{
    $this->addOWLAxiomAnnotation(
        RDFtriple::URI(DC_MODIFIED, false),
        RDFtriple::Literal(date('c'), XS_DATETIME, "")
    );
}
/**
 * Extracts geographic coordinates from a page and emits them as triples.
 *
 * Coordinates are collected from three places, in this order:
 *  1. the title-level coordinate found by extractGeoInfo($pageSource, true),
 *  2. every infobox, matched against each entry of $this->knownInfoboxFormats,
 *  3. every infobox property named in $this->knownTemplatePredicates.
 *
 * Only the first valid coordinate yields triples: WGS84 lat/long, plus
 * GeoNames featureClass/featureCode, population and a YAGO landmark type
 * when the coordinate carries a matching 'type' attribute.
 *
 * When GeoExtractor::enableDebug / GeoExtractor::enablePreview are set, HTML
 * is written directly to the output.
 *
 * @param mixed $pageID     page identifier; subject of all emitted triples
 * @param mixed $pageTitle  not used by this extractor
 * @param mixed $pageSource wiki source that is scanned for coordinates
 * @return ExtractionResult the collected triples (possibly none)
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $foundCoordinates = array();

    /* Main title */
    if ($geoInfo = $this->extractGeoInfo($pageSource, true)) {
        $pageG = new geo_param($geoInfo);
        array_push($foundCoordinates, $pageG);
        if (GeoExtractor::enableDebug) {
            echo "<h3>Found title entry '" . implode("|", $geoInfo) . "'</h3>";
        }
    }

    /* Coordinates provided in infobox formats */
    $infoboxes = $this->getInfoboxes($pageSource);
    foreach ($infoboxes[1] as $box) {
        $boxProperties = $this->getBoxProperties($box, true);

        foreach ($this->knownInfoboxFormats as $format) {
            /* Initialize global defaults */
            $pieces = array(null, 0, 0, 'N', null, 0, 0, 'E');

            /* Apply template-specific NS/EW defaults */
            if (isset($format[8])) {
                /* NS default */
                $pieces[3] = $format[8];
            }
            if (isset($format[9])) {
                /* EW default */
                $pieces[7] = $format[9];
            }

            /* Copy from template */
            $formatLength = count($format);
            for ($i = 0; $i < $formatLength; $i++) {
                $formatString = urldecode($format[$i]);
                if ($formatString == "") {
                    continue;
                }
                // Box properties are looked up by lower-cased property name.
                $propertyKey = strtolower($formatString);
                if (!isset($boxProperties[$propertyKey])) {
                    continue;
                }
                if ($i == 7 && $boxProperties[$propertyKey] == 'O') {
                    /* German coordinates: Treat 'O' (Ost) as 'E' (East) */
                    $pieces[$i] = 'E';
                } else {
                    $pieces[$i] = $boxProperties[$propertyKey];
                }
            }

            if (geo_param::is_lat($pieces[0]) && geo_param::is_long($pieces[4])) {
                $g = new geo_param($pieces);
                array_push($foundCoordinates, $g);
                if (GeoExtractor::enableDebug) {
                    echo "<h3>Found format '" . $format[0] . "'</h3>";
                }
            }
        } /* $this->knownInfoboxFormats */

        /*
         * Look for coordinate tags inside the infobox
         * Used widely, e.g. airports (=> Los_Angeles_International_Airport)
         * These don't have to set the title, as they're in a first-level infobox
         * We could simply take all references, but the idea here is to filter out
         * cases where multiple locations are specified
         */
        foreach ($this->knownTemplatePredicates as $coord) {
            $coordKey = strtolower(urldecode($coord));
            if (isset($boxProperties[$coordKey])
                && ($geoInfo = $this->extractGeoInfo($boxProperties[$coordKey]))
            ) {
                $pageG = new geo_param($geoInfo);
                array_push($foundCoordinates, $pageG);
                if (GeoExtractor::enableDebug) {
                    echo "<h3>Found infobox entry '" . implode("|", $geoInfo) . "'</h3>";
                }
            }
        }
    } /* infoboxes */

    $numResults = 0;
    foreach ($foundCoordinates as $g) {
        if ($g->is_valid()) {
            if (GeoExtractor::enablePreview) {
                // Inline HTML preview; the markup between the PHP tags is emitted verbatim.
                ?> <iframe src="http://christianhbecker.com/da/workspace/gmap.php?lat=<?php echo $g->latdeg; ?> &long=<?php echo $g->londeg; ?> &scale=5" style="width: 500px; height: 300px; border: none;" scrolling="no"> </iframe> <?php
            }

            /* Only process the first result */
            if (++$numResults == 1) {
                $result->addTriple(
                    RDFtriple::page($pageID),
                    RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"),
                    RDFtriple::Literal((string) $g->latdeg, "http://www.w3.org/2001/XMLSchema#float", NULL)
                );
                $result->addTriple(
                    RDFtriple::page($pageID),
                    RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"),
                    RDFtriple::Literal((string) $g->londeg, "http://www.w3.org/2001/XMLSchema#float", NULL)
                );

                /*
                 * Process additional attributes
                 * See http://en.wikipedia.org/wiki/Wikipedia:WikiProject_Geographical_coordinates#Parameters
                 */
                if (isset($g->attr['type'])) {
                    if (isset($this->typeToGeoNames[$g->attr['type']])) {
                        $result->addTriple(
                            RDFtriple::page($pageID),
                            RDFtriple::URI("http://www.geonames.org/ontology#featureClass"),
                            RDFtriple::URI("http://www.geonames.org/ontology#" . $this->typeToGeoNames[$g->attr['type']][0])
                        );
                        if (isset($this->typeToGeoNames[$g->attr['type']][1])) {
                            $result->addTriple(
                                RDFtriple::page($pageID),
                                RDFtriple::URI("http://www.geonames.org/ontology#featureCode"),
                                RDFtriple::URI("http://www.geonames.org/ontology#" . $this->typeToGeoNames[$g->attr['type']][0] . '.' . $this->typeToGeoNames[$g->attr['type']][1])
                            );
                        }
                    }

                    // Lower-case the attribute itself before comparing. The previous code
                    // lower-cased the boolean result of the comparison, which made the
                    // 'city' / 'landmark' checks case-sensitive.
                    $geoType = strtolower($g->attr['type']);

                    /* city(pop): City, town or village with specified population */
                    if ($geoType == 'city' && isset($g->attr['arg:type'])) {
                        $result->addTriple(
                            RDFtriple::page($pageID),
                            RDFtriple::URI("http://www.geonames.org/ontology#population"),
                            RDFtriple::Literal($g->attr['arg:type'], "http://www.w3.org/2001/XMLSchema#integer", NULL)
                        );
                    }

                    /*
                     * landmark: Cultural landmark, building of special interest, tourist attraction and other points of interest
                     * Mapped to YAGO; could in theory use http://www.eionet.europa.eu/gemet/concept/8525 ("tourist facility"),
                     * but it seems as if any POI not matching the above categories is tagged as a landmark
                     * (=> "Google" etc.)
                     */
                    if ($geoType == 'landmark') {
                        $result->addTriple(
                            RDFtriple::page($pageID),
                            RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                            RDFtriple::URI("http://dbpedia.org/class/yago/Landmark108624891")
                        );
                    }
                } /* type */

                /*
                 * region: ISO 3166-1 alpha-2 country code or ISO 3166-2 region code to GeoNames inCountry
                 * Very unreliable, as it just sets a preferred map view,
                 * i.e. Germany's and The Czech Republic's regions are set to "EN"....
                 */
                /*if (isset($g->attr['region'])) {
                    $result->addTriple(
                        RDFtriple::page($pageID),
                        RDFtriple::URI("http://www.geonames.org/ontology#inCountry"),
                        RDFtriple::URI("http://www.geonames.org/countries/#". substr($g->attr['region'], 0, 2)));
                } */
            }
        } /* is_valid */
    } /* $foundCoordinates */

    return $result;
}
/**
 * Runs parseAttributeValue() and turns everything it produced into RDF triples.
 *
 * parseAttributeValue() can report results through two channels, and both are
 * collected here:
 *  - the global $parseResult, read as a list of entries whose elements 2..5
 *    are object, object type, datatype and language;
 *  - its return value, read as a 4-element array with the same four fields.
 *
 * For each collected entry:
 *  - a missing language falls back to $language;
 *  - object type 'l' becomes a literal, with <br> variants replaced by "\n";
 *  - object type 'r' becomes a URI, with <br> variants removed;
 *  - any other object type is logged as a warning and skipped.
 *
 * @global mixed $parseResult reset to null before the call and again after it has been read
 * @param mixed $value             forwarded to parseAttributeValue()
 * @param mixed $templateChildName forwarded; also the subject page of every triple
 * @param mixed $propertyName      forwarded; also the predicate URI of every triple
 * @param mixed $language          forwarded; also the fallback literal language
 * @return array list of RDFtriple objects, possibly empty
 */
function parseAttributeValueWrapper($value, $templateChildName, $propertyName, $language)
{
    global $parseResult;

    // Clear stale global state so only this call's output is picked up.
    $parseResult = null;
    $returned = parseAttributeValue($value, $templateChildName, $propertyName, $language);

    // Remap global and returned results into one uniform (object, type, datatype, lang) list.
    $candidates = array();
    if (isset($parseResult)) {
        foreach ($parseResult as $entry) {
            list(, , $obj, $objType, $objDatatype, $objLang) = $entry;
            $candidates[] = array($obj, $objType, $objDatatype, $objLang);
        }
    }
    $parseResult = null;

    if (isset($returned)) {
        list($obj, $objType, $objDatatype, $objLang) = $returned;
        $candidates[] = array($obj, $objType, $objDatatype, $objLang);
    }

    $lineBreakTags = array('<br>', '<br/>', '<br />');
    $triples = array();

    foreach ($candidates as $candidate) {
        list($object, $objectType, $dataType, $objectLanguage) = $candidate;

        // The language is passed into the parser but not handed back, so default it here.
        $objectLanguage = isset($objectLanguage) ? $objectLanguage : $language;

        $isLiteral = ($objectType == 'l');
        $isResource = ($objectType == "r");

        // Special newline handling: literals keep line breaks, resources drop them.
        if ($isLiteral) {
            $object = str_replace($lineBreakTags, "\n", $object);
        } elseif ($isResource) {
            $object = str_replace($lineBreakTags, '', $object);
        }

        if ($isResource) {
            $object = RDFtriple::URI($object);
        } elseif ($isLiteral) {
            $object = RDFtriple::Literal($object, $dataType, $objectLanguage);
        } else {
            Logger::warn("Shouldn't happen - found a blank node where none expected - objectType = {$objectType}");
            continue;
        }

        $triples[] = new RDFtriple(RDFtriple::page($templateChildName), RDFtriple::URI($propertyName), $object);
    }

    return $triples;
}