/**
 * Sets up per-page state from the page meta information and the global options.
 *
 * @param array $metainfo the keys 'pageTitle', 'language' and 'oaiId' are read
 */
function __construct($metainfo)
{
    // Identity of the page being processed.
    $this->uri = RDFtriple::page($metainfo['pageTitle']);
    $this->language = $metainfo['language'];
    $this->oaiId = $metainfo['oaiId'];

    // Settings copied from the option registry.
    $this->graphURI = Options::getOption('graphURI');
    $this->annotationGraphURI = Options::getOption('annotationGraphURI');
    $this->generateOWLAxiomAnnotations = Options::getOption('generateOWLAxiomAnnotations');
    $this->languageProperties = Options::getOption('stringPredicateWithForeignlanguages');
    $this->debug_turn_off_insert = Options::getOption('debug_turn_off_insert');
    $this->debug_run_tests = Options::getOption('debug_run_tests');

    // Values derived from the page URI.
    $this->hash = new Hash($this->oaiId, $this->uri->getURI());
    $this->subjectSPARULpattern = $this->uri->toSPARULPattern();

    // The filter options are read but not applied; one warning is logged per option that is set.
    foreach (array('predicateFilter', 'objectFilter', 'predicateObjectFilter') as $filterOption) {
        if (!Options::isOptionSet($filterOption)) {
            continue;
        }
        Options::getOption($filterOption);
        $this->log(WARN, 'currently not working');
    }
}
/**
 * Builds the triple for a date-valued property.
 *
 * The raw value is run through $this->parser. Elements 0 and 1 of the parse result are
 * handed to RDFtriple::Literal() as its first and second argument.
 *
 * The former regex/checkdate validation had been commented out, leaving a flag that was
 * always true, an unreachable fallback branch and an unused pattern; all three are removed.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array one RDFtriple, or an empty array when the parser result is not set
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();
    $parseResults = $this->parser->parse($value);

    // print_r() with its second argument set to true returns the dump instead of printing
    // it, which replaces the manual ob_start()/ob_get_contents() capture.
    Logger::debug("Date parser \n" . "value was: {$value} \n" . print_r($parseResults, true));

    if (!isset($parseResults)) {
        return $result;
    }

    $result[] = new RDFtriple(
        RDFtriple::page($subjectName),
        RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
        RDFtriple::Literal($parseResults[0], $parseResults[1], null)
    );

    return $result;
}
/**
 * Extracts semantic annotations of the forms [[property::target]] and [[property:=value]].
 *
 * Brace-delimited template markup is stripped from the source first, because templates
 * are handled by the Infobox Extractor.
 *
 * Fix: the property-name character class of the link pattern read "a-zA-z". The range
 * "A-z" also covers the characters [ \ ] ^ _ and `, so brackets could end up inside a
 * property name. It is now "a-zA-Z", matching the literal pattern below.
 *
 * @param string $pageID     page identifier; encoded with encodeLocalName() before use as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $pageID = encodeLocalName($pageID);

    // Find (nested) brace groups, cut their outer {{ }} and remove the re-wrapped text
    // from the source.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
    }

    // Internal semantic links: [[property::target]]
    preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate(encodeLocalName($match[2])),
            RDFtriple::page($match[4])
        );
    }

    // Literals: [[property:=value]]
    preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // parseAttributeValue() yields: object, object_is, datatype(, language)
        $parsed = parseAttributeValue($match[4], $pageID, $match[2]);
        $lexicalForm = $parsed[0];
        $datatype = $parsed[2];
        $predicate = propertyToCamelCase(encodeLocalName($match[2]));

        // Skip values that parsed to nothing.
        if ($lexicalForm == null) {
            continue;
        }

        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate($predicate),
            RDFtriple::literal($lexicalForm, $datatype, 'en')
        );
    }

    return $result;
}
/**
 * Builds one triple per parsed (value, datatype) pair of a property value.
 *
 * Wiki links, {{cite ...}} templates, HTML tags and HTML comments are cleaned from the
 * value before it is handed to $this->parser. When the parser result is not set, the
 * cleaned value is emitted as a plain literal instead.
 *
 * Fix: preg_match_all() fills $matches in pattern order, so iterating $matches itself
 * visited a single entry (the list of all full matches) and only the first cite template
 * was replaced. The loop now walks $matches[0], i.e. every full match.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list of RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();
    $value = Util::replaceWikiLinks($value);

    // TODO: why are cite templates stripped only in this case?
    preg_match_all("/{{2}cite.*?\\}{2}/i", $value, $matches);
    foreach ($matches[0] as $citeTemplate) {
        $value = str_replace($citeTemplate, Util::replaceTemplates($citeTemplate), $value);
    }

    $value = Util::removeHtmlTags($value);
    $value = Util::removeHtmlComments($value);

    $parseResultArray = $this->parser->parse($value);
    if (isset($parseResultArray)) {
        foreach ($parseResultArray as $parseResults) {
            // An empty datatype is passed on as null.
            $parsedDataType = $parseResults[1];
            if ($parsedDataType == "") {
                $parsedDataType = null;
            }
            $result[] = new RDFtriple(
                RDFtriple::page($subjectName),
                RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
                RDFtriple::Literal((string) $parseResults[0], $parsedDataType, null)
            );
        }
    } else {
        // TODO: generate a log file with unparsed values
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($value)
        );
    }

    return $result;
}
/**
 * Emits depiction, thumbnail and rights triples for the image found in the page source.
 *
 * Nothing is emitted when no image is found or when either generated URL fails
 * URI::validate().
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $imageInfo = $this->extract_image_url($pageSource);
    $image = ucfirst($imageInfo[0]);
    $width = $imageInfo[1];
    if ($image == null) {
        return $result;
    }

    $fullSizeUrl = $this->make_image_url($image, false, true);
    $thumbnailUrl = $this->make_image_url($image, $width);
    $image = str_replace(" ", "_", trim($image));

    if (!(URI::validate($fullSizeUrl) && URI::validate($thumbnailUrl))) {
        return $result;
    }

    // page -> full-size image
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI(FOAF_DEPICTION),
        RDFtriple::URI($fullSizeUrl)
    );
    // full-size image -> thumbnail
    $result->addTriple(
        RDFtriple::URI($fullSizeUrl),
        RDFtriple::URI(FOAF_THUMBNAIL),
        RDFtriple::URI($thumbnailUrl)
    );
    // page -> thumbnail
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI(DBO_THUMBNAIL),
        RDFtriple::URI($thumbnailUrl)
    );

    // Both image URLs point back to the Wikipedia image description page.
    $image = urlencode($image);
    $descriptionPage = 'http://' . $this->language . '.wikipedia.org/wiki/Image:' . $image;
    $result->addTriple(
        RDFtriple::URI($thumbnailUrl),
        RDFtriple::URI(DC_RIGHTS),
        RDFtriple::URI($descriptionPage)
    );
    $result->addTriple(
        RDFtriple::URI($fullSizeUrl),
        RDFtriple::URI(DC_RIGHTS),
        RDFtriple::URI($descriptionPage)
    );

    return $result;
}
/**
 * Emits rdf:type triples for the classes that the "facts" table lists as subClassOf
 * targets of the page's categories.
 *
 * Pages whose id itself contains the category prefix are skipped. The matched classes
 * are collected in $this->ClassArray, which is reset whenever category links are found.
 * A failing query terminates the script via die().
 *
 * Fix: the category pattern used a greedy "(.*)", so two category links on the same line
 * were captured as one name running from the first "[[" to the last "]]". The capture is
 * now lazy and stops at the first "]]".
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Category pages themselves are not typed.
    if (preg_match("/" . WIKIMEDIA_CATEGORY . ":/", $pageID)) {
        return $result;
    }

    // Collect all category links of the article.
    if (!preg_match_all("/\\[\\[" . WIKIMEDIA_CATEGORY . ":(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
        return $result;
    }

    $this->ClassArray = array();
    foreach ($matches as $match) {
        // Drop the sort key / label after the pipe.
        $Category = preg_replace("/\\|.*/", "", $match[1]);
        $query = "select Arg2 from facts where Relation = 'subClassOf' and Arg1 = 'wikicategory_"
            . mysql_escape_string(str_replace(" ", "_", $Category)) . "'";
        $queryresult = mysql_query($query, $this->DBlink) or die("Anfrage fehlgeschlagen: " . mysql_error());
        while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
            // Keyed by class name so that each class is emitted only once.
            $this->ClassArray[$row["Arg2"]] = true;
        }
    }

    foreach ($this->ClassArray as $subject => $bool) {
        $YagoClass = str_replace("wordnet_", "", $subject);
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI(RDF_TYPE, false),
            RDFtriple::URI(DB_YAGO_NS . $this->camel($YagoClass, "_"))
        );
    }

    return $result;
}
/**
 * Links the page to its wikicompany.org counterpart via foaf:page.
 *
 * The triple is emitted only when the extractor language is "en".
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  title, encoded with URI::wikipediaEncode() for the target URL
 * @param string $pageSource not used by this extractor
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    if ($this->language != "en") {
        return $result;
    }

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/page");
    $object = RDFtriple::URI("http://wikicompany.org/wiki/" . URI::wikipediaEncode($pageTitle));
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Emits the long abstract of the page as an "abstract" literal in the extractor language.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Strip wiki markup, then cut the abstract (called with 3000, false, false).
    $plainText = $this->remove_wikicode($pageSource);
    $abstractText = $this->extract_abstract($plainText, 3000, false, false);

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::predicate("abstract");
    $object = RDFtriple::Literal($abstractText, NULL, $this->language);
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Builds one literal triple per string returned by $this->parser for the given value.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list of RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    foreach ($this->parser->parse($value) as $parsedString) {
        $subject = RDFtriple::page($subjectName);
        $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
        $object = RDFtriple::Literal($parsedString);
        $result[] = new RDFtriple($subject, $predicate, $object);
    }

    return $result;
}
/**
 * Emits the abstract of the page as an rdfs:comment literal in the extractor language.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Strip wiki markup, then cut the abstract with the helper's default arguments.
    $plainText = $this->remove_wikicode($pageSource);
    $abstractText = $this->extract_abstract($plainText);

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#comment");
    $object = RDFtriple::Literal($abstractText, NULL, $this->language);
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Emits an owl:sameAs link to the DBpedia resource of the same name when the page
 * source carries a {{wikipedia-c}} or {{wikipedia-c-note}} template.
 *
 * @param string $pageID     page identifier; encoded with encodeLocalName() before use
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $pageID = encodeLocalName($pageID);

    $hasWikipediaTemplate = preg_match('/\\{\\{wikipedia\\-c(\\-note)?\\}\\}/', $pageSource);
    if (!$hasWikipediaTemplate) {
        return $result;
    }

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://www.w3.org/2002/07/owl#sameAs");
    $object = RDFtriple::URI("http://dbpedia.org/resource/" . $pageID);
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Builds one triple per parser result, with the result turned into a page resource.
 *
 * Wiki emphasis markup is removed from the value before parsing.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list of RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    // TODO: add the language as a parameter
    $plainValue = Util::removeWikiEmphasis($value);

    foreach ($this->parser->parse($plainValue) as $target) {
        $subject = RDFtriple::page($subjectName);
        $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
        $object = RDFtriple::page($target);
        $result[] = new RDFtriple($subject, $predicate, $object);
    }

    return $result;
}
/**
 * Emits a foaf:depiction triple for the full-size image of the page.
 *
 * Nothing is emitted when no image URL is found or the URL fails URI::validate().
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  passed on to extract_image_url()
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $imageUrl = $this->extract_image_url($pageSource, $pageTitle);
    $isUsable = $imageUrl != null && URI::validate($imageUrl);
    if (!$isUsable) {
        return $result;
    }

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/depiction");
    $object = RDFtriple::URI($imageUrl);
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Builds a language-tagged literal triple from the value with its markup stripped.
 *
 * No triple is produced when the stripped, trimmed text is empty.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list with at most one RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    // Kept as a variable rather than an inline literal, as in the original call.
    $imageArgument = "image";
    $plainText = trim(ActiveAbstractExtractor::stripMarkup($value, $imageArgument));

    if ($plainText == "") {
        return $result;
    }

    $subject = RDFtriple::page($subjectName);
    $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
    $object = RDFtriple::Literal($plainText, null, $this->language);
    $result[] = new RDFtriple($subject, $predicate, $object);

    return $result;
}
/**
 * Emits FOAF/DC person triples from the Persondata block of a page.
 *
 * Birth and death place are resolved to English page names: the first wiki link in the
 * field is looked up through DatabaseWikipedia and the "[[en:...]]" interlanguage link
 * of that source is used. Name literals and the description are tagged "de".
 *
 * Fix: missing Persondata fields and a missing English interlanguage link used to be
 * read as undefined array entries (notices, null values). They are now guarded with
 * isset(); the set of emitted triples is unchanged.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $PersonData = $this->extractPersondata($pageSource, $this->language);

    if ($PersonData == null) {
        return $result;
    }

    $WikiDB = new DatabaseWikipedia($this->language);

    // Resolve both places to the English page name ("" when it cannot be resolved).
    $englishPlace = array();
    foreach (array('birthplace', 'deathplace') as $field) {
        $rawPlace = isset($PersonData[$field]) ? $PersonData[$field] : "";
        preg_match("/\\[\\[([^\\]]*)\\]\\]/", $rawPlace, $placeMatch);
        // NOTE(review): the whole match array is passed here, while other callers pass
        // the captured string ($match[1]) -- confirm getLinkForLabeledLink() accepts both.
        $placeLink = $this->getLinkForLabeledLink($placeMatch);
        $placeSource = $WikiDB->getSource($placeLink);
        preg_match("/\\[\\[en:(.*)\\]\\]/", $placeSource, $langLink);
        $englishPlace[$field] = isset($langLink[1]) ? $langLink[1] : "";
    }
    $BirthPlace = $englishPlace['birthplace'];
    $DeathPlace = $englishPlace['deathplace'];

    // The three name triples are always emitted.
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://xmlns.com/foaf/0.1/name"),
        RDFtriple::Literal($PersonData['name'], null, "de")
    );
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://xmlns.com/foaf/0.1/givenname"),
        RDFtriple::Literal($PersonData['givenname'], null, "de")
    );
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://xmlns.com/foaf/0.1/surname"),
        RDFtriple::Literal($PersonData['surname'], null, "de")
    );

    if ($BirthPlace != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("birthPlace"),
            RDFtriple::page($BirthPlace)
        );
    }
    if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("birth"),
            RDFtriple::Literal($PersonData['birthdate'], "http://www.w3.org/2001/XMLSchema#date", null)
        );
    }
    if ($DeathPlace != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("deathPlace"),
            RDFtriple::page($DeathPlace)
        );
    }
    if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("death"),
            RDFtriple::Literal($PersonData['deathdate'], "http://www.w3.org/2001/XMLSchema#date", null)
        );
    }
    if (isset($PersonData['description']) && $PersonData['description'] != "") {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI("http://purl.org/dc/elements/1.1/description"),
            RDFtriple::Literal($PersonData['description'], null, "de")
        );
    }

    // Every page with Persondata is typed as foaf:Person.
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
        RDFtriple::URI("http://xmlns.com/foaf/0.1/Person")
    );

    return $result;
}
/**
 * Emits a "reference" triple for every valid external link of the page.
 *
 * The keys of the array returned by extract_external_links() are used as URLs; entries
 * whose URL fails URI::validate() are skipped.
 *
 * Fix: the loop used list()/each(), which is deprecated since PHP 7.2 and removed in
 * PHP 8.0. A plain foreach visits the same key/value pairs.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $extlinks = $this->extract_external_links($pageSource, $this->language);

    foreach ($extlinks as $ExtURL => $ExtName) {
        if (!URI::validate($ExtURL)) {
            continue;
        }
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("reference"),
            RDFtriple::URI($ExtURL)
        );
    }

    return $result;
}
/**
 * Builds one resource triple per link the parser finds in the value.
 *
 * Each link is brought into canonical wiki case, encoded with encodeLocalName() and
 * appended to $this->basePath to form the object URI.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list of RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();

    foreach ($this->parser->parse($value) as $rawLink) {
        $localName = encodeLocalName($this->mediaWikiUtil->toCanonicalWikiCase($rawLink));
        $resourceUri = $this->basePath . $localName;

        $subject = RDFtriple::page($subjectName);
        $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
        $object = RDFtriple::URI($resourceUri);
        $result[] = new RDFtriple($subject, $predicate, $object);
    }

    return $result;
}
/**
 * Emits a "wordnet_type" triple for every WordNet synset URL that the database maps to
 * a template used on the page.
 *
 * Only runs when the extractor language is "en". A failing query terminates the script
 * via die().
 *
 * @param string $pageID     page identifier; used as subject and as page_title in the query
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource not used by this extractor
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    if ($this->language != "en") {
        return $result;
    }

    // templatelinks -> wordnet_mapping -> wordnet_synsets_links, restricted to namespace 0.
    $query = "select wsl.url from templatelinks tl inner join page p on p.page_id = tl.tl_from\r\n\t\t\t\t\t\t"
        . "inner join dbpedia_develop.wordnet_mapping wm on tl.tl_title = wm.infobox\r\n\t\t\t\t\t\t"
        . "inner join dbpedia_develop.wordnet_synsets_links wsl on wm.ID1 = wsl.synset30ID\r\n\t\t\t\t\t\t"
        . "where p.page_title = '" . mysql_escape_string($pageID) . "' and p.page_namespace = 0";

    $rows = mysql_query($query, $this->DBlink) or die("Query failed:\n{$query}\n" . mysql_error());

    while ($row = mysql_fetch_array($rows, MYSQL_ASSOC)) {
        $subject = RDFtriple::page($pageID);
        $predicate = RDFtriple::predicate("wordnet_type");
        $object = RDFtriple::URI($row["url"]);
        $result->addTriple($subject, $predicate, $object);
    }

    return $result;
}
/**
 * Emits geo:lat and geo:long float literals from the first <geo>lat;long...</geo> tag
 * in the page source.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $geoPattern = '/<geo>([\\-0-9\\.]+);([\\-0-9\\.]+)[^0-9]*[^<]*<\\/geo>/';
    if (!preg_match($geoPattern, $pageSource, $match)) {
        return $result;
    }

    // Latitude first, then longitude.
    $coordinates = array(
        "http://www.w3.org/2003/01/geo/wgs84_pos#lat" => $match[1],
        "http://www.w3.org/2003/01/geo/wgs84_pos#long" => $match[2],
    );
    foreach ($coordinates as $predicateUri => $coordinate) {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::URI($predicateUri),
            RDFtriple::Literal($coordinate, "http://www.w3.org/2001/XMLSchema#float", NULL)
        );
    }

    return $result;
}
/**
 * Emits a skos:subject triple for every category link of a non-category page.
 *
 * Fix: the pattern captured with a greedy "(.*)", so several "[[Category:...]]" links
 * on one line were merged into a single bogus category name. The capture is now lazy
 * and stops at the first "]]".
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Category pages themselves are skipped.
    if (preg_match("/Category:/", $pageID)) {
        return $result;
    }

    if (preg_match_all("/\\[\\[Category:(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // Drop the sort key / label after the pipe.
            $Category = preg_replace("/\\|.*/", "", $match[1]);
            $result->addTriple(
                RDFtriple::page($pageID),
                RDFtriple::URI("http://www.w3.org/2004/02/skos/core#subject"),
                RDFtriple::page("Category:" . $Category)
            );
        }
    }

    return $result;
}
/**
 * Skeleton extractor: when the page source contains "{{chembox header}}", a placeholder
 * triple is added and its predicate is registered in $this->allPredicates.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult holds all extracted triples
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Without the chembox header there is nothing to extract.
    if (!preg_match("/{{chembox header}}/", $pageSource, $match)) {
        return $result;
    }

    // (parsing would go here) -- one triple per property ...
    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::predicate("my_chem_property");
    $object = RDFtriple::Literal("my_value");
    $result->addTriple($subject, $predicate, $object);

    // ... and every predicate goes into the predicate collection.
    $this->allPredicates->addPredicate("my_chem_property");

    return $result;
}
/**
 * Builds a single literal triple from the cleaned property value.
 *
 * Templates (only when "{{" occurs), HTML tags, HTML comments and wiki emphasis are
 * removed before parsing. The literal is the parser result when it is set, otherwise
 * the cleaned value itself.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list with exactly one RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    if (strpos($value, "{{") !== false) {
        $value = Util::replaceTemplates($value);
    }
    $value = Util::removeWikiEmphasis(
        Util::removeHtmlComments(
            Util::removeHtmlTags($value)
        )
    );

    $parsed = $this->parser->parse($value);

    // TODO: add a debug log file for un-parsed triples (the fallback case).
    $lexicalForm = isset($parsed) ? $parsed : $value;

    return array(
        new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($lexicalForm)
        ),
    );
}
/**
 * Emits a foaf:homepage triple for the first valid homepage link of the page.
 *
 * Candidates are collected in this order: infobox properties listed in
 * $this->knownHomepagePredicates, then bullet items of the "External links" section
 * (per-language heading from $this->externalLinkSections) that mention one of
 * $this->knownLinkDesignations. Only the first candidate passing URI::validate() is used.
 *
 * Fix: when the external-links heading was configured but absent from the page,
 * $matches[1] was read although preg_match() had found nothing (undefined offset, null
 * subject passed to preg_match_all()). The section is now scanned only after a match.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $foundLinks = array();

    /* Look in infoboxes */
    $infoboxes = $this->getInfoboxes($pageSource);
    foreach ($infoboxes[1] as $box) {
        $boxProperties = $this->getBoxProperties($box, true);
        foreach ($this->knownHomepagePredicates as $pred) {
            $key = strtolower(urldecode($pred));
            if (isset($boxProperties[$key])) {
                $foundLinks[] = $this->parseURL($boxProperties[$key], true);
                if (HomepageExtractor::enableDebug) {
                    echo "<h3>Found box property '" . $pred . "'</h3>";
                }
            }
        }
    }

    /* Process "External links" */
    if (isset($this->externalLinkSections[$this->language])) {
        // Heading plus everything up to (not including) the next heading.
        $sectionPattern = '/(==+\\s*' . $this->externalLinkSections[$this->language] . '\\s*==+(?:.(?!==+[^=]+==+))*)/s';
        if (preg_match($sectionPattern, $pageSource, $matches)) {
            preg_match_all('/\\*\\s*([^\\n]*)/', $matches[1], $links);
            $linkDesignationsPattern = '/\\b(' . implode('|', $this->knownLinkDesignations) . ')\\b/i';
            foreach ($links[1] as $link) {
                if (preg_match($linkDesignationsPattern, $link)) {
                    $foundLinks[] = $this->parseURL($link, true);
                }
            }
        }
    }

    $numResults = 0;
    foreach ($foundLinks as $link) {
        if (URI::validate($link)) {
            if (HomepageExtractor::enableDebug) {
                echo "<h3>Found link {$link}</h3>";
            }
            /* Only process the first result */
            if (++$numResults == 1) {
                $result->addTriple(
                    RDFtriple::page($pageID),
                    RDFtriple::URI("http://xmlns.com/foaf/0.1/homepage"),
                    RDFtriple::URI($link)
                );
            }
        }
    }

    return $result;
}
/**
 * Emits a DB_REDIRECT triple from the current page to the first link of a redirect page.
 *
 * An Exception raised while building or adding the triple is logged at INFO level and
 * the triple is skipped.
 *
 * Fix: the catch block called $o->getURI(), but $o is still undefined when building the
 * object is what threw, which turned a recoverable case into a fatal error. The log now
 * falls back to the raw link target when no object exists.
 *
 * @param string $pageID     page identifier
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    if (!Util::isRedirect($pageSource, $this->language)) {
        return $result;
    }
    if (preg_match("/\\[\\[(.*?)\\]\\]/", $pageSource, $matches) !== 1) {
        return $result;
    }

    $target = $matches[1];
    $o = null;
    try {
        $s = $this->getPageURI();
        $p = RDFtriple::URI(DB_REDIRECT, false);
        $o = RDFtriple::page($this->getLinkForLabeledLink($target));
        $result->addTriple($s, $p, $o);
    } catch (Exception $e) {
        // Invalid URIs raise an exception; in that case the triple is simply not written.
        $rejected = ($o !== null) ? $o->getURI() : $target;
        $this->log(INFO, $rejected . ' is an invalid uri');
    }

    return $result;
}
/**
 * Emits a DB_DISAMBIGUATES triple for every link on a disambiguation page whose target
 * contains the page's own name and has no namespace prefix.
 *
 * Example (http://en.wikipedia.org/wiki/User): [[Wikipedia:Username policy]] is omitted,
 * [[User (computing)]] is included.
 * TODO: [[Consumer]] should be included too, but is not, as it does not contain "user".
 *
 * Fixes:
 *  - preg_quote() was called without the delimiter, so a page id containing "/" broke
 *    the regular expression. The delimiter is now passed.
 *  - strpos() was tested for truthiness, so a suffix word found at offset 0 counted as
 *    "not found". It is now compared against false.
 *
 * @param string $pageID     page identifier used as subject
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    global $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION;

    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    if (Util::isDisambiguation($pageSource, $this->language)) {
        // Strip the "_(<disambiguation word>)" suffix from the page id.
        if (isset($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language])) {
            foreach ($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language] as $disambig) {
                if (strpos($pageID, $disambig) !== false) {
                    $pageIDClean = str_replace('_(' . $disambig . ')', '', $pageID);
                }
            }
        } else {
            $pageIDClean = str_replace('_(disambiguation)', '', $pageID);
        }

        if (!isset($pageIDClean)) {
            // No configured suffix word occurs in the page id; fall back to matching
            // every namespace-free link and report it below.
            $pageIDClean = "";
            $warn = "pageidclean not set";
        }

        $regex = '/\\[\\[([^:\\[\\]]*?' . preg_quote($pageIDClean, '/') . '[^\\[\\]]*?)\\]\\]/i';
        if (preg_match_all($regex, $pageSource, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $object = DB_RESOURCE_NS . URI::wikipediaEncode($this->getLinkForLabeledLink($match[1]));
                try {
                    $object = RDFtriple::URI($object);
                } catch (Exception $e) {
                    $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
                    continue;
                }
                $result->addTriple(
                    RDFtriple::page($pageID),
                    RDFtriple::URI(DB_DISAMBIGUATES, false),
                    $object
                );
            }
        }
    }

    if (isset($warn)) {
        $this->log('warn', $warn . " {$pageID} \n");
    }

    return $result;
}
/**
 * Scans the templates of a text for PND identifiers and adds them to $result.
 *
 * Handled templates:
 *  - "Normdaten": the value of its "PND=" argument is added as individualised PND.
 *  - "PND":       the first argument is added as individualised PND, a second one as
 *                 non-individualised PND; each only when at least 9 characters long.
 *  - "PNDfehlt":  the first argument is added as non-individualised PND when at least
 *                 9 characters long.
 * The subject of every triple is $this->pageID.
 *
 * @param string           $text      wiki source to scan
 * @param string           $pageTitle not used by this method
 * @param ExtractionResult $result    receives the triples (passed by reference)
 * @return void
 */
public function findPND($text, $pageTitle, &$result)
{
    foreach (Util::getTemplates($text) as $template) {
        $templateName = $template["name"];

        if ($templateName == "Normdaten") {
            preg_match('/\\|\\s*PND\\s*=\\s*([0-9X]*)/i', $template["content"], $argument);
            if (isset($argument[1])) {
                // individualised PND
                $result->addTriple(
                    RDFtriple::page($this->pageID),
                    RDFtriple::URI(DBO_INDIVIDUALISED_PND, false),
                    RDFtriple::Literal($argument[1], NULL, NULL)
                );
            }
            continue;
        }

        if ($templateName == "PND") {
            preg_match('/\\s*PND\\s*\\|\\s*([0-9X]*)(.*)/i', $template["content"], $argument);
            if (isset($argument[1]) && strlen($argument[1]) >= 9) {
                // individualised PND
                $result->addTriple(
                    RDFtriple::page($this->pageID),
                    RDFtriple::URI(DBO_INDIVIDUALISED_PND, false),
                    RDFtriple::Literal($argument[1], NULL, NULL)
                );
            }
            if (isset($argument[2])) {
                preg_match('/\\|\\s*([0-9X]*)/i', $argument[2], $secondArgument);
                if (isset($secondArgument[1]) && strlen($secondArgument[1]) >= 9) {
                    // non-individualised PND
                    $result->addTriple(
                        RDFtriple::page($this->pageID),
                        RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false),
                        RDFtriple::Literal($secondArgument[1], NULL, NULL)
                    );
                }
            }
            continue;
        }

        if ($templateName == "PNDfehlt") {
            preg_match('/\\s*PNDfehlt\\s*(\\|\\s*.*)/i', $template["content"], $argument);
            if (!isset($argument[1])) {
                continue;
            }
            preg_match('/\\|\\s*([0-9X]*)/i', $argument[1], $firstArgument);
            if (isset($firstArgument[1]) && strlen($firstArgument[1]) >= 9) {
                // non-individualised PND
                $result->addTriple(
                    RDFtriple::page($this->pageID),
                    RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false),
                    RDFtriple::Literal($firstArgument[1], NULL, NULL)
                );
            }
        }
    }
}
/**
 * Registers template redirects in the template_uri table.
 *
 * For every link on a redirect page, the link target is looked up in template_uri. For
 * each row found, the URI of the redirecting page is inserted with the same template_id
 * unless it is already present; every insert is counted and echoed. No triple is added
 * to the returned result. A failing query terminates the script via die().
 *
 * @param string $pageID     page identifier of the (redirecting) page
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult always without triples
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    if (!Util::isRedirect($pageSource, $this->language)) {
        return $result;
    }
    if (!preg_match_all("/\\[\\[([^\\]]*)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
        return $result;
    }

    foreach ($matches as $match) {
        // Built but not added to $result. NOTE(review): presumably a leftover; kept
        // because the constructors still run -- confirm before removing.
        $s = RDFtriple::page($pageID);
        $p = RDFtriple::predicate("redirect");
        $o = RDFtriple::page($this->getLinkForLabeledLink($match[1]));

        // Lower-case, underscore and quote-escape the link target, restoring the
        // capitalised "Template:" prefix.
        $targetName = mb_strtolower($match[1]);
        $targetName = str_replace(" ", "_", $targetName);
        $targetName = str_replace("'", "\\'", $targetName);
        $targetName = str_replace("template:", "Template:", $targetName);
        $targetUri = DB_RESOURCE_NS . $targetName;

        $lookupQuery = "select * from template_uri where uri = '{$targetUri}'";
        $lookupRows = mysql_query($lookupQuery, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $lookupQuery);

        while ($row = mysql_fetch_array($lookupRows, MYSQL_ASSOC)) {
            $templateId = $row['template_id'];

            // Same normalisation for the redirecting page itself.
            $sourceName = mb_strtolower($pageID);
            $sourceName = str_replace(" ", "_", $sourceName);
            $sourceName = str_replace("'", "\\'", $sourceName);
            $sourceName = str_replace("template:", "Template:", $sourceName);
            $sourceUri = DB_RESOURCE_NS . $sourceName;

            $existsQuery = "select * from template_uri where uri = '{$sourceUri}'";
            $existsRows = mysql_query($existsQuery, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $existsQuery);
            if (mysql_num_rows($existsRows) > 0) {
                // Already registered.
                continue;
            }

            $insertQuery = "INSERT INTO template_uri (template_id, uri) VALUES ('" . $templateId . "', '" . $sourceUri . "')";
            mysql_query($insertQuery, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $insertQuery);

            $this->redirectTemplateCounter++;
            echo "{$this->redirectTemplateCounter}: {$pageID} => {$match[1]} {$sourceUri} (FOUND)\n";
        }
    }

    return $result;
}
/**
 * Builds one triple per non-empty parsed (value, datatype) pair of a property value.
 *
 * HTML tags and comments are removed before parsing. When the parser result is not set,
 * the cleaned value is emitted as a plain literal instead.
 *
 * @param string $subjectName  page name, turned into the subject via RDFtriple::page()
 * @param string $propertyName local name appended to DB_ONTOLOGY_NS
 * @param string $value        raw property value
 * @return array list of RDFtriple
 */
public function generate($subjectName, $propertyName, $value)
{
    $result = array();
    $value = Util::removeHtmlComments(Util::removeHtmlTags($value));
    $parsedPairs = $this->parser->parse($value);

    // Nothing parsed: fall back to the cleaned raw value.
    if (!isset($parsedPairs)) {
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal($value)
        );
        return $result;
    }

    foreach ($parsedPairs as $pair) {
        // An empty datatype is passed on as null.
        $datatype = $pair[1];
        if ($datatype == "") {
            $datatype = null;
        }
        // Pairs with an empty value are skipped.
        if ($pair[0] == "") {
            continue;
        }
        $result[] = new RDFtriple(
            RDFtriple::page($subjectName),
            RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
            RDFtriple::Literal((string) $pair[0], $datatype, null)
        );
    }

    return $result;
}
/**
 * Wraps a single literal triple in an ExtractionResult and hands it to the destination.
 *
 * The call to the destination is timed under the key 'destination:accept'.
 *
 * @param $destination receiver of the result; must not be null
 * @param $pageID      turned into an RDF URI with RDFtriple::page()
 * @param $predicate   must already be an RDF URI
 * @param $text        turned into a literal in the extractor language with RDFtriple::Literal()
 * @return void
 */
private function writeTriple($destination, $pageID, $predicate, $text)
{
    $extraction = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $extraction->addTriple(
        RDFtriple::page($pageID),
        $predicate,
        RDFtriple::Literal($text, NULL, $this->language)
    );

    Timer::start('destination:accept');
    $destination->accept($extraction);
    Timer::stop('destination:accept');
}
/**
 * Emits FOAF/DC person triples from the Persondata block of a page.
 *
 * When the option 'Persondata.usedb' is set, birth and death place are resolved to
 * English page names: the first wiki link in the field is looked up through
 * DatabaseWikipediaCollection and the "[[en:...]]" interlanguage link of that source is
 * used. Without the option no place triples are produced. Name literals and the
 * description are tagged "de".
 *
 * Fix: the page source buffer was initialised as $mysource but read as $mySource.
 * Variable names are case-sensitive, so with the DB option off an undefined variable
 * was passed to preg_match(). The buffer is now initialised under the name that is read.
 *
 * @param string $pageID     page identifier
 * @param string $pageTitle  not used by this extractor
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $PersonData = $this->extractPersondata($pageSource, $this->language);

    if ($PersonData == null) {
        return $result;
    }

    $useDb = Options::getOption('Persondata.usedb');
    if ($useDb) {
        $WikiDB = new DatabaseWikipediaCollection($this->language);
    }

    // Resolve both places to the English page name, where possible.
    $englishPlace = array();
    foreach (array('birthplace', 'deathplace') as $field) {
        if (!isset($PersonData[$field])) {
            continue;
        }
        preg_match("/\\[\\[([^\\]]*)\\]\\]/", $PersonData[$field], $placeMatch);
        if (!isset($placeMatch[0])) {
            continue;
        }
        // NOTE(review): the whole match array is passed here, while other callers pass
        // the captured string ($match[1]) -- confirm getLinkForLabeledLink() accepts both.
        $placeLink = $this->getLinkForLabeledLink($placeMatch);

        $mySource = "";
        if ($useDb) {
            $mySource = $WikiDB->getSource($placeLink);
        }
        preg_match("/\\[\\[en:(.*)\\]\\]/", $mySource, $langLink);
        if (isset($langLink[1])) {
            $englishPlace[$field] = $langLink[1];
        }
    }

    if (isset($PersonData['name']) && $PersonData['name'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(FOAF_NAME, false),
            RDFtriple::Literal($PersonData['name'], null, "de")
        );
    }
    if (isset($PersonData['givenname']) && $PersonData['givenname'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(FOAF_GIVENNAME, false),
            RDFtriple::Literal($PersonData['givenname'], null, "de")
        );
    }
    if (isset($PersonData['surname']) && $PersonData['surname'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(FOAF_SURNAME, false),
            RDFtriple::Literal($PersonData['surname'], null, "de")
        );
    }
    if (isset($englishPlace['birthplace']) && $englishPlace['birthplace'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DB_BIRTHPLACE, false),
            RDFtriple::page($englishPlace['birthplace'])
        );
    }
    if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DB_BIRTH, false),
            RDFtriple::Literal($PersonData['birthdate'], XS_DATE, null)
        );
    }
    if (isset($englishPlace['deathplace']) && $englishPlace['deathplace'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DB_DEATHPLACE, false),
            RDFtriple::page($englishPlace['deathplace'])
        );
    }
    if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DB_DEATH, false),
            RDFtriple::Literal($PersonData['deathdate'], XS_DATE, null)
        );
    }
    if (isset($PersonData['description']) && $PersonData['description'] != "") {
        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DC_DESCRIPTION, false),
            RDFtriple::Literal($PersonData['description'], null, "de")
        );
    }

    // Every page with Persondata is typed as foaf:Person.
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(RDF_TYPE, false),
        RDFtriple::URI(FOAF_PERSON, false)
    );

    return $result;
}