/**
 * Sets up the per-page state from the page meta information and from the
 * global Options.
 *
 * Reads the keys 'pageTitle', 'language' and 'oaiId' from $metainfo. A
 * warning is logged for every filter option that is set.
 *
 * @param array $metainfo page meta information
 */
function __construct($metainfo)
 {
     // Page identity.
     $this->uri = RDFtriple::page($metainfo['pageTitle']);
     $this->language = $metainfo['language'];
     $this->oaiId = $metainfo['oaiId'];

     // Property name => option name, copied in this order.
     $optionBackedProperties = array(
         'graphURI' => 'graphURI',
         'annotationGraphURI' => 'annotationGraphURI',
         'generateOWLAxiomAnnotations' => 'generateOWLAxiomAnnotations',
         'languageProperties' => 'stringPredicateWithForeignlanguages',
         'debug_turn_off_insert' => 'debug_turn_off_insert',
         'debug_run_tests' => 'debug_run_tests',
     );
     foreach ($optionBackedProperties as $property => $optionName) {
         $this->{$property} = Options::getOption($optionName);
     }

     $this->hash = new Hash($this->oaiId, $this->uri->getURI());
     $this->subjectSPARULpattern = $this->uri->toSPARULPattern();

     // The filter options are read but not used yet; only a warning is logged.
     foreach (array('predicateFilter', 'objectFilter', 'predicateObjectFilter') as $filterOption) {
         if (!Options::isOptionSet($filterOption)) {
             continue;
         }
         Options::getOption($filterOption);
         $this->log(WARN, 'currently not working');
     }
 }
 /**
  * Builds the triple for a date-valued property.
  *
  * The value is passed to $this->parser->parse() and the parse result is
  * written to the debug log. When the parser returns null no triple is
  * produced; otherwise element 0 of the parse result becomes the literal
  * value and element 1 its datatype.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw value handed to the parser
  * @return array list holding zero or one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $parseResults = $this->parser->parse($value);

     // print_r() in return mode gives the same dump as capturing its output
     // through a manual output buffer.
     $str = "Date parser \n";
     $str .= "value was: {$value} \n";
     $str .= print_r($parseResults, true);
     Logger::debug($str);

     if (!isset($parseResults)) {
         return $result;
     }

     // NOTE: no format/calendar validation is applied to the parsed date;
     // every non-null parse result is emitted as-is.
     $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($parseResults[0], $parseResults[1], null));
     return $result;
 }
Example #3
0
 /**
  * Extracts semantic links ([[property::target]]) and semantic literals
  * ([[property:=value]]) from the page source.
  *
  * All brace-delimited templates found in the source are removed before
  * matching (the original note says they are handled by the Infobox
  * Extractor).
  *
  * @param mixed  $pageID     page identifier; passed through encodeLocalName()
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     // Strip templates: match balanced {...} groups recursively, then remove
     // each one from the source.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Cut the surrounding double braces
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }
     // Extract internal semantic links.
     // The property class is a-z, A-Z, digits, '-', space and '_'. The former
     // range "A-z" also admitted the characters [ \ ] ^ and backtick.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }
     // Extract literals
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // Result layout per the original note: object, object_is, datatype(, language)
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));
         // Skip empty values
         if ($lexicalForm == null) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }
     return $result;
 }
Example #4
0
 /**
  * Builds one triple per parse result for a property value.
  *
  * Wiki links are replaced, every {{cite ...}} template is expanded through
  * Util::replaceTemplates(), and HTML tags and comments are removed before
  * the value is handed to $this->parser. When the parser returns null a
  * single triple carrying the cleaned value as plain literal is produced.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw property value
  * @return array list of RDFtriple objects
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $value = Util::replaceWikiLinks($value);
     //TODO: why are cite templates only removed in this case?
     // preg_match_all() fills $matches in pattern order, so $matches[0] is
     // the list of all full matches. Iterating $matches itself handled only
     // the first cite template.
     preg_match_all("/{{2}cite.*?\\}{2}/i", $value, $matches);
     foreach ($matches[0] as $citeTemplate) {
         $value = str_replace($citeTemplate, Util::replaceTemplates($citeTemplate), $value);
     }
     $value = Util::removeHtmlTags($value);
     $value = Util::removeHtmlComments($value);
     $parseResultArray = $this->parser->parse($value);
     if (isset($parseResultArray)) {
         foreach ($parseResultArray as $parseResults) {
             // Element 0 is used as the value, element 1 as its datatype; an
             // empty datatype is passed on as null.
             $parsedDataType = $parseResults[1];
             if ($parsedDataType == "") {
                 $parsedDataType = null;
             }
             $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal((string) $parseResults[0], $parsedDataType, null));
         }
     } else {
         //TODO: generate a logfile with unparsed values
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($value));
     }
     return $result;
 }
Example #5
0
 /**
  * Extracts the page image and emits depiction, thumbnail and rights triples.
  *
  * Nothing is emitted when no image is found or when either generated URL
  * fails URI::validate().
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     $imageInfo = $this->extract_image_url($pageSource);
     $imageName = ucfirst($imageInfo[0]);
     $thumbWidth = $imageInfo[1];
     if ($imageName == null) {
         return $result;
     }

     $fullUrl = $this->make_image_url($imageName, false, true);
     $thumbUrl = $this->make_image_url($imageName, $thumbWidth);
     if (!(URI::validate($fullUrl) && URI::validate($thumbUrl))) {
         return $result;
     }

     // page -> full size image
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI(FOAF_DEPICTION),
         RDFtriple::URI($fullUrl)
     );
     // full size image -> thumbnail
     $result->addTriple(
         RDFtriple::URI($fullUrl),
         RDFtriple::URI(FOAF_THUMBNAIL),
         RDFtriple::URI($thumbUrl)
     );
     // page -> thumbnail
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI(DBO_THUMBNAIL),
         RDFtriple::URI($thumbUrl)
     );

     // Both image URLs point back to the Wikipedia image description page.
     $encodedName = urlencode(str_replace(" ", "_", trim($imageName)));
     $descriptionPage = 'http://' . $this->language . '.wikipedia.org/wiki/Image:' . $encodedName;
     foreach (array($thumbUrl, $fullUrl) as $imageUrl) {
         $result->addTriple(
             RDFtriple::URI($imageUrl),
             RDFtriple::URI(DC_RIGHTS),
             RDFtriple::URI($descriptionPage)
         );
     }
     return $result;
 }
Example #6
0
 /**
  * Emits rdf:type triples for the classes that the "facts" table lists as
  * superclasses (Relation = 'subClassOf') of the page's categories.
  *
  * Pages whose ID contains the category namespace prefix are skipped.
  * $this->ClassArray is reset whenever a page has at least one category link.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Skip pages that are themselves in the category namespace
     if (preg_match("/" . WIKIMEDIA_CATEGORY . ":/", $pageID)) {
         return $result;
     }
     // Match all category links. The capture is lazy so that two links on
     // the same line are not merged into one match.
     if (!preg_match_all("/\\[\\[" . WIKIMEDIA_CATEGORY . ":(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         return $result;
     }
     $this->ClassArray = array();
     foreach ($matches as $match) {
         // Drop the sort key / label after the pipe
         $Category = preg_replace("/\\|.*/", "", $match[1]);
         // Escape with the connection's character set instead of the
         // deprecated mysql_escape_string().
         $escapedCategory = mysql_real_escape_string(str_replace(" ", "_", $Category), $this->DBlink);
         $query = "select Arg2 from facts where Relation = 'subClassOf' and Arg1 = 'wikicategory_" . $escapedCategory . "'";
         $queryresult = mysql_query($query, $this->DBlink) or die("Anfrage fehlgeschlagen: " . mysql_error());
         while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
             // Keyed by class name so that every class is emitted once
             $this->ClassArray[$row["Arg2"]] = true;
         }
     }
     foreach ($this->ClassArray as $subject => $bool) {
         $YagoClass = str_replace("wordnet_", "", $subject);
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(DB_YAGO_NS . $this->camel($YagoClass, "_")));
     }
     return $result;
 }
Example #7
0
 /**
  * Links the page to its wikicompany.org page via foaf:page.
  *
  * The triple is only produced when $this->language is "en".
  *
  * @param mixed $pageID     page identifier used as subject
  * @param mixed $pageTitle  title encoded into the wikicompany URL
  * @param mixed $pageSource not used by this method
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     if ($this->language != "en") {
         return $result;
     }
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://xmlns.com/foaf/0.1/page"),
         RDFtriple::URI("http://wikicompany.org/wiki/" . URI::wikipediaEncode($pageTitle))
     );
     return $result;
 }
 /**
  * Emits the long abstract of the page as an "abstract" literal tagged with
  * $this->language.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $plainText = $this->remove_wikicode($pageSource);
     $abstractText = $this->extract_abstract($plainText, 3000, false, false);
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::predicate("abstract"),
         RDFtriple::Literal($abstractText, null, $this->language)
     );
     return $result;
 }
 /**
  * Builds one plain-literal triple per element returned by the parser.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw value handed to $this->parser->parse()
  * @return array list of RDFtriple objects; empty when the parser yields
  *               nothing iterable
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $parseResults = $this->parser->parse($value);
     // Other generators in this code base treat a null parse result as "no
     // match"; guard here so that foreach is never given a non-iterable.
     if (!is_array($parseResults) && !($parseResults instanceof Traversable)) {
         return $result;
     }
     foreach ($parseResults as $mystring) {
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::Literal($mystring));
     }
     return $result;
 }
 /**
  * Emits the abstract of the page as an rdfs:comment literal tagged with
  * $this->language.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $plainText = $this->remove_wikicode($pageSource);
     // extract_abstract() is called with its default limits here.
     $shortAbstract = $this->extract_abstract($plainText);
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#comment"),
         RDFtriple::Literal($shortAbstract, null, $this->language)
     );
     return $result;
 }
Example #11
0
 /**
  * Emits an owl:sameAs link to the DBpedia resource of the same name when
  * the page source contains a {{wikipedia-c}} or {{wikipedia-c-note}}
  * template.
  *
  * @param mixed  $pageID     page identifier; passed through encodeLocalName()
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     if (!preg_match('/\\{\\{wikipedia\\-c(\\-note)?\\}\\}/', $pageSource)) {
         return $result;
     }
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2002/07/owl#sameAs"),
         RDFtriple::URI("http://dbpedia.org/resource/" . $pageID)
     );
     return $result;
 }
Example #12
0
 /**
  * Builds one triple per parsed element, with the element turned into a
  * page resource as object.
  *
  * Wiki emphasis markup is removed from the value before parsing.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw property value
  * @return array list of RDFtriple objects; empty when the parser yields
  *               nothing iterable
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $value = Util::removeWikiEmphasis($value);
     //TODO: add language as param
     $parseResults = $this->parser->parse($value);
     // Other generators in this code base treat a null parse result as "no
     // match"; guard here so that foreach is never given a non-iterable.
     if (!is_array($parseResults) && !($parseResults instanceof Traversable)) {
         return $result;
     }
     foreach ($parseResults as $r) {
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::page($r));
     }
     return $result;
 }
Example #13
0
 /**
  * Emits a foaf:depiction triple for the image URL extracted from the page.
  *
  * Nothing is emitted when no URL is found or when it fails URI::validate().
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  passed on to extract_image_url()
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $depictionUrl = $this->extract_image_url($pageSource, $pageTitle);
     if ($depictionUrl == null) {
         return $result;
     }
     if (!URI::validate($depictionUrl)) {
         return $result;
     }
     // Full size image
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://xmlns.com/foaf/0.1/depiction"),
         RDFtriple::URI($depictionUrl)
     );
     return $result;
 }
Example #14
0
 /**
  * Builds a language-tagged literal triple from the value after markup
  * stripping.
  *
  * No triple is produced when the stripped and trimmed text is empty.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw property value
  * @return array list holding zero or one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     // Kept in a variable because it is handed to stripMarkup() as its
     // second argument exactly as before.
     $image = "image";
     $plainText = trim(ActiveAbstractExtractor::stripMarkup($value, $image));
     if ($plainText == "") {
         return $result;
     }
     $result[] = new RDFtriple(
         RDFtriple::page($subjectName),
         RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
         RDFtriple::Literal($plainText, null, $this->language)
     );
     return $result;
 }
Example #15
0
 /**
  * Extracts person data (name, birth/death date and place, description) and
  * emits the corresponding triples plus an rdf:type foaf:Person triple.
  *
  * Birth and death places are resolved by looking up the linked place page
  * through DatabaseWikipedia and taking the target of its [[en:...]]
  * language link. A place triple is emitted only when that resolution
  * succeeds.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $PersonData = $this->extractPersondata($pageSource, $this->language);
     if ($PersonData == null) {
         return $result;
     }
     $WikiDB = new DatabaseWikipedia($this->language);
     $wikiLinkPattern = "/\\[\\[([^\\]]*)\\]\\]/";
     $enLinkPattern = "/\\[\\[en:(.*)\\]\\]/";

     // Resolve the birth place. Every step is checked, so a missing key or a
     // failed match leaves the place empty instead of reading an undefined
     // index.
     $BirthPlace = "";
     if (isset($PersonData['birthplace']) && preg_match($wikiLinkPattern, $PersonData['birthplace'], $Birthplacematch)) {
         $Birthplacematch = $this->getLinkForLabeledLink($Birthplacematch);
         $mySource = $WikiDB->getSource($Birthplacematch);
         if (preg_match($enLinkPattern, $mySource, $LangLinkmatch)) {
             $BirthPlace = $LangLinkmatch[1];
         }
     }

     // Resolve the death place the same way.
     $DeathPlace = "";
     if (isset($PersonData['deathplace']) && preg_match($wikiLinkPattern, $PersonData['deathplace'], $Deathplacematch)) {
         $Deathplacematch = $this->getLinkForLabeledLink($Deathplacematch);
         $mySource = $WikiDB->getSource($Deathplacematch);
         if (preg_match($enLinkPattern, $mySource, $LangLinkmatch)) {
             $DeathPlace = $LangLinkmatch[1];
         }
     }

     // Name triples are emitted unconditionally, tagged "de".
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/name"), RDFtriple::Literal($PersonData['name'], null, "de"));
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/givenname"), RDFtriple::Literal($PersonData['givenname'], null, "de"));
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/surname"), RDFtriple::Literal($PersonData['surname'], null, "de"));
     if ($BirthPlace != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birthPlace"), RDFtriple::page($BirthPlace));
     }
     if (isset($PersonData['birthdate']) && $PersonData['birthdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birth"), RDFtriple::Literal($PersonData['birthdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if ($DeathPlace != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("deathPlace"), RDFtriple::page($DeathPlace));
     }
     if (isset($PersonData['deathdate']) && $PersonData['deathdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("death"), RDFtriple::Literal($PersonData['deathdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if (isset($PersonData['description']) && $PersonData['description'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://purl.org/dc/elements/1.1/description"), RDFtriple::Literal($PersonData['description'], null, "de"));
     }
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI("http://xmlns.com/foaf/0.1/Person"));
     return $result;
 }
 /**
  * Emits a "reference" triple for every valid external link URL of the page.
  *
  * The keys of the array returned by extract_external_links() are used as
  * URLs; the values are not used here.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach replaces the list()/each() idiom; each() is deprecated since
     // PHP 7.2 and removed in PHP 8.
     foreach ($extlinks as $ExtURL => $ExtName) {
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("reference"), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
 /**
  * Builds one triple per parsed link, with the link turned into a URI below
  * $this->basePath as object.
  *
  * Each link is brought into canonical wiki case and then encoded with
  * encodeLocalName() before being appended to the base path.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw value handed to $this->parser->parse()
  * @return array list of RDFtriple objects; empty when the parser yields
  *               nothing iterable
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $result = array();
     $links = $this->parser->parse($value);
     // Other generators in this code base treat a null parse result as "no
     // match"; guard here so that foreach is never given a non-iterable.
     if (!is_array($links) && !($links instanceof Traversable)) {
         return $result;
     }
     foreach ($links as $link) {
         $link = $this->mediaWikiUtil->toCanonicalWikiCase($link);
         $link = encodeLocalName($link);
         $resource = $this->basePath . $link;
         $result[] = new RDFtriple(RDFtriple::page($subjectName), RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName), RDFtriple::URI($resource));
     }
     return $result;
 }
Example #18
0
 /**
  * Emits "wordnet_type" triples for the WordNet synset URLs that the
  * database maps to the templates used on the page.
  *
  * The query only runs when $this->language is "en". It joins
  * templatelinks/page with the dbpedia_develop wordnet_mapping and
  * wordnet_synsets_links tables, restricted to the page title in
  * namespace 0.
  *
  * @param mixed $pageID     page identifier; used as subject and as page title in the query
  * @param mixed $pageTitle  not used by this method
  * @param mixed $pageSource not used by this method
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->language == "en") {
         // Escape with the connection's character set instead of the
         // deprecated mysql_escape_string().
         $escapedPageID = mysql_real_escape_string($pageID, $this->DBlink);
         $query = "select wsl.url from templatelinks tl inner join page p on p.page_id = tl.tl_from\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_mapping wm on tl.tl_title = wm.infobox\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_synsets_links wsl on wm.ID1 = wsl.synset30ID\r\n\t\t\t\t\t\twhere p.page_title = '" . $escapedPageID . "' and p.page_namespace = 0";
         $queryresult = mysql_query($query, $this->DBlink) or die("Query failed:\n{$query}\n" . mysql_error());
         while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
             $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("wordnet_type"), RDFtriple::URI($row["url"]));
         }
     }
     return $result;
 }
Example #19
0
 /**
  * Emits geo:lat and geo:long float literals from the first
  * <geo>lat;long ...</geo> tag found in the page source.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Locate the geo coordinates; bail out when there is no <geo> tag
     if (!preg_match('/<geo>([\\-0-9\\.]+);([\\-0-9\\.]+)[^0-9]*[^<]*<\\/geo>/', $pageSource, $geo)) {
         return $result;
     }
     $latitude = $geo[1];
     $longitude = $geo[2];
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"),
         RDFtriple::Literal($latitude, "http://www.w3.org/2001/XMLSchema#float", null)
     );
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"),
         RDFtriple::Literal($longitude, "http://www.w3.org/2001/XMLSchema#float", null)
     );
     return $result;
 }
 /**
  * Emits a skos:subject triple for every [[Category:...]] link of the page.
  *
  * Pages whose ID contains "Category:" are skipped. Anything after a pipe in
  * the link is dropped from the category name.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     if (preg_match("/Category:/", $pageID)) {
         return $result;
     }
     // The capture is lazy so that two category links on the same line are
     // matched separately instead of being merged into one.
     if (preg_match_all("/\\[\\[Category:(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             $Category = preg_replace("/\\|.*/", "", $match[1]);
             $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2004/02/skos/core#subject"), RDFtriple::page("Category:" . $Category));
         }
     }
     return $result;
 }
Example #21
0
 /**
  * Template extractor: emits a fixed placeholder triple when the page source
  * contains {{chembox header}} and registers the predicate in
  * $this->allPredicates.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Holds all extracted triples
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Nothing to do unless the source contains {{chembox header}}
     if (!preg_match("/{{chembox header}}/", $pageSource)) {
         return $result;
     }
     // Real parsing would go here; one triple is added per property ...
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::predicate("my_chem_property"),
         RDFtriple::Literal("my_value")
     );
     // ... and every predicate is added to the predicate collection
     $this->allPredicates->addPredicate("my_chem_property");
     return $result;
 }
Example #22
0
 /**
  * Builds a single plain-literal triple from the cleaned value.
  *
  * Templates (when the value contains "{{"), HTML tags, HTML comments and
  * wiki emphasis are removed first. The literal carries the parser result,
  * or the cleaned value itself when the parser returns null.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw property value
  * @return array list holding exactly one RDFtriple
  */
 public function generate($subjectName, $propertyName, $value)
 {
     if (strpos($value, "{{") !== false) {
         $value = Util::replaceTemplates($value);
     }
     $value = Util::removeWikiEmphasis(Util::removeHtmlComments(Util::removeHtmlTags($value)));

     $parsed = $this->parser->parse($value);
     //TODO: add debug logfile for un-parsed triples
     $literalText = isset($parsed) ? $parsed : $value;

     return array(
         new RDFtriple(
             RDFtriple::page($subjectName),
             RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
             RDFtriple::Literal($literalText)
         ),
     );
 }
Example #23
0
 /**
  * Finds the homepage of the page's subject and emits a foaf:homepage triple
  * for the first valid candidate.
  *
  * Candidates are collected first from infobox properties named in
  * $this->knownHomepagePredicates, then from bullet items of the
  * "External links" section whose text matches one of
  * $this->knownLinkDesignations.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $foundLinks = array();
     /* Look in infoboxes */
     $infoboxes = $this->getInfoboxes($pageSource);
     foreach ($infoboxes[1] as $box) {
         $boxProperties = $this->getBoxProperties($box, true);
         foreach ($this->knownHomepagePredicates as $pred) {
             $key = strtolower(urldecode($pred));
             if (isset($boxProperties[$key])) {
                 $foundLinks[] = $this->parseURL($boxProperties[$key], true);
                 if (HomepageExtractor::enableDebug) {
                     echo "<h3>Found box property '" . $pred . "'</h3>";
                 }
             }
         }
     }
     /* Process "External links" */
     if (isset($this->externalLinkSections[$this->language])) {
         $sectionPattern = '/(==+\\s*' . $this->externalLinkSections[$this->language] . '\\s*==+(?:.(?!==+[^=]+==+))*)/s';
         // Only look for bullet items when the section exists; reading
         // $matches[1] after a failed match is an undefined-index access.
         if (preg_match($sectionPattern, $pageSource, $matches)) {
             preg_match_all('/\\*\\s*([^\\n]*)/', $matches[1], $links);
             $linkDesignationsPattern = '/\\b(' . implode('|', $this->knownLinkDesignations) . ')\\b/i';
             foreach ($links[1] as $link) {
                 if (preg_match($linkDesignationsPattern, $link)) {
                     $foundLinks[] = $this->parseURL($link, true);
                 }
             }
         }
     }
     $numResults = 0;
     foreach ($foundLinks as $link) {
         if (URI::validate($link)) {
             if (HomepageExtractor::enableDebug) {
                 echo "<h3>Found link {$link}</h3>";
             }
             /* Only the first valid link becomes a triple; the loop keeps
                running so that debug output lists every valid link. */
             if (++$numResults == 1) {
                 $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/homepage"), RDFtriple::URI($link));
             }
         }
     }
     return $result;
 }
Example #24
0
 /**
  * Emits a DB_REDIRECT triple from the current page URI to the target of the
  * first wiki link of a redirect page.
  *
  * The subject is taken from $this->getPageURI(), not from $pageID. When
  * building or adding the triple throws, the triple is skipped and the
  * failure is logged at INFO level.
  *
  * @param mixed  $pageID     page identifier; only used for the ExtractionResult
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (Util::isRedirect($pageSource, $this->language)) {
         if (preg_match("/\\[\\[(.*?)\\]\\]/", $pageSource, $matches) === 1) {
             try {
                 $s = $this->getPageURI();
                 $p = RDFtriple::URI(DB_REDIRECT, false);
                 $o = RDFtriple::page($this->getLinkForLabeledLink($matches[1]));
                 $result->addTriple($s, $p, $o);
             } catch (Exception $e) {
                 // Thrown when a URI is not valid; in that case the triple is
                 // simply not written. $o is unset when the exception came
                 // from building it, so fall back to the raw link text rather
                 // than calling a method on an undefined variable.
                 $invalid = isset($o) ? $o->getURI() : $matches[1];
                 $this->log(INFO, $invalid . ' is an invalid uri');
             }
         }
     }
     return $result;
 }
 /**
  * Emits DB_DISAMBIGUATES triples for the links of a disambiguation page.
  *
  * Only links that contain the cleaned page name and have no colon before
  * it (no namespace) are used. Example from http://en.wikipedia.org/wiki/User:
  * - [[Wikipedia:Username policy]] is omitted
  * - [[User (computing)]] and many others are included
  * - TODO: [[Consumer]] should be included but is not, because it does not
  *   contain "user"
  *
  * The cleaned page name is the page ID without its "_(<marker>)" suffix,
  * where the markers come from $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION for the
  * current language, or "disambiguation" when no markers are configured.
  *
  * @param mixed  $pageID     page identifier used as subject
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     global $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION;
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (Util::isDisambiguation($pageSource, $this->language)) {
         if (isset($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language])) {
             foreach ($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language] as $disambig) {
                 // strpos() returns 0 for a hit at the start of the string,
                 // so compare against false explicitly.
                 if (strpos($pageID, $disambig) !== false) {
                     $pageIDClean = str_replace('_(' . $disambig . ')', '', $pageID);
                 }
             }
         } else {
             $pageIDClean = str_replace('_(disambiguation)', '', $pageID);
         }
         if (!isset($pageIDClean)) {
             // No configured marker occurs in the page ID: fall back to an
             // empty name and report it after extraction.
             $pageIDClean = "";
             $warn = "pageidclean not set";
         }
         // The delimiter is passed to preg_quote() so that a "/" in the page
         // name cannot terminate the pattern early.
         $regex = '/\\[\\[([^:\\[\\]]*?' . preg_quote($pageIDClean, '/') . '[^\\[\\]]*?)\\]\\]/i';
         if (preg_match_all($regex, $pageSource, $matches, PREG_SET_ORDER)) {
             foreach ($matches as $match) {
                 $object = DB_RESOURCE_NS . URI::wikipediaEncode($this->getLinkForLabeledLink($match[1]));
                 try {
                     $object = RDFtriple::URI($object);
                 } catch (Exception $e) {
                     $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
                     continue;
                 }
                 $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(DB_DISAMBIGUATES, false), $object);
             }
         }
     }
     if (isset($warn)) {
         $this->log('warn', $warn . " {$pageID} \n");
     }
     return $result;
 }
Example #26
0
 /**
  * Scans the templates of a text for PND identifiers and adds them to
  * $result.
  *
  * Three templates are recognised:
  * - "Normdaten": the value of its PND= argument is added as individualised PND.
  * - "PND": the first argument is added as individualised PND and the
  *   second as non-individualised PND, each only when at least 9 characters long.
  * - "PNDfehlt": the first argument is added as non-individualised PND when
  *   at least 9 characters long.
  *
  * The subject of every triple is $this->pageID.
  *
  * @param string $text      text whose templates are read via Util::getTemplates()
  * @param mixed  $pageTitle not used by this method
  * @param mixed  $result    object receiving the triples through addTriple()
  * @return void
  */
 public function findPND($text, $pageTitle, &$result)
 {
     $templates = Util::getTemplates($text);
     foreach ($templates as $template) {
         if ($template["name"] == "Normdaten") {
             preg_match('/\\|\\s*PND\\s*=\\s*([0-9X]*)/i', $template["content"], $match);
             // The capture group may match the empty string ("PND=" without
             // a value); skip that case instead of emitting an empty literal.
             if (isset($match[1]) && $match[1] !== '') {
                 // add individualised PND
                 $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_INDIVIDUALISED_PND, false), RDFtriple::Literal($match[1], null, null));
             }
         } elseif ($template["name"] == "PND") {
             preg_match('/\\s*PND\\s*\\|\\s*([0-9X]*)(.*)/i', $template["content"], $match);
             if (isset($match[1]) && strlen($match[1]) >= 9) {
                 // add individualised PND
                 $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_INDIVIDUALISED_PND, false), RDFtriple::Literal($match[1], null, null));
             }
             if (isset($match[2])) {
                 // The remainder after the first argument may hold a second one
                 preg_match('/\\|\\s*([0-9X]*)/i', $match[2], $match1);
                 if (isset($match1[1]) && strlen($match1[1]) >= 9) {
                     // add non-individualised PND
                     $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false), RDFtriple::Literal($match1[1], null, null));
                 }
             }
         } elseif ($template["name"] == "PNDfehlt") {
             preg_match('/\\s*PNDfehlt\\s*(\\|\\s*.*)/i', $template["content"], $match);
             if (isset($match[1])) {
                 preg_match('/\\|\\s*([0-9X]*)/i', $match[1], $match1);
                 if (isset($match1[1]) && strlen($match1[1]) >= 9) {
                     // add non-individualised PND
                     $result->addTriple(RDFtriple::page($this->pageID), RDFtriple::URI(DBO_NON_INDIVIDUALISED_PND, false), RDFtriple::Literal($match1[1], null, null));
                 }
             }
         }
     }
 }
 /**
  * Registers template redirects in the template_uri table.
  *
  * For a redirect page, every wiki link in the source is looked up in
  * template_uri. For each row found, the URI built from $pageID is inserted
  * with the same template_id unless it is already present, and a progress
  * line is echoed.
  *
  * The returned ExtractionResult never receives a triple here: $s, $p and $o
  * are built but not passed to addTriple().
  *
  * NOTE(review): the SQL statements are assembled by string concatenation
  * and only single quotes are escaped by hand — confirm that page titles
  * reaching this method cannot contain backslashes or other characters
  * special to MySQL.
  *
  * @param mixed  $pageID     page identifier of the redirecting page
  * @param mixed  $pageTitle  not used by this method
  * @param string $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (Util::isRedirect($pageSource, $this->language)) {
         if (preg_match_all("/\\[\\[([^\\]]*)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
             foreach ($matches as $match) {
                 // Built but never added to $result (see method comment).
                 $s = RDFtriple::page($pageID);
                 $p = RDFtriple::predicate("redirect");
                 $o = RDFtriple::page($this->getLinkForLabeledLink($match[1]));
                 // URI of the redirect target: lower-cased, spaces to
                 // underscores, single quotes backslash-escaped, and the
                 // "template:" prefix re-capitalised.
                 $templateredirecturi = str_replace("template:", "Template:", str_replace("'", "\\'", str_replace(" ", "_", mb_strtolower($match[1]))));
                 $templateredirecturi = DB_RESOURCE_NS . $templateredirecturi;
                 $query = "select * from template_uri where uri = '{$templateredirecturi}'";
                 $dbresult = mysql_query($query, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $query);
                 // Assigned below but not read afterwards in this method.
                 $uri = "";
                 while ($row = mysql_fetch_array($dbresult, MYSQL_ASSOC)) {
                     $uri = $row['uri'];
                     $template_id = $row['template_id'];
                     /*
                     echo "$this->redirectTemplateCounter: $pageID => $match[1]";
                     echo " $uri (FOUND)";
                     echo "\n";
                     */
                     // URI of the redirecting page itself, normalised the
                     // same way as the target URI above.
                     $newtemplateuri = DB_RESOURCE_NS . str_replace("template:", "Template:", str_replace("'", "\\'", str_replace(" ", "_", mb_strtolower($pageID))));
                     $query_template_uri = "select * from template_uri where uri = '{$newtemplateuri}'";
                     $dbresult_template_uri = mysql_query($query_template_uri, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $query_template_uri);
                     if (mysql_num_rows($dbresult_template_uri) > 0) {
                         // Already registered: nothing to do.
                         //echo "$pageID already in DB";
                     } else {
                         // Register the redirecting page under the target's
                         // template_id and report it.
                         $insertquery = "INSERT INTO template_uri (template_id, uri) VALUES ('" . $template_id . "', '" . $newtemplateuri . "')";
                         mysql_query($insertquery, $this->link) or die("Query failed: " . mysql_error() . ' - ' . $insertquery);
                         $this->redirectTemplateCounter++;
                         echo "{$this->redirectTemplateCounter}: {$pageID} => {$match['1']}";
                         echo " {$newtemplateuri} (FOUND)";
                         echo "\n";
                     }
                 }
             }
         }
     }
     return $result;
 }
 /**
  * Builds one triple per non-empty parse result for a property value.
  *
  * HTML tags and comments are removed before parsing. When the parser
  * returns null, a single triple carrying the cleaned value as plain literal
  * is produced instead.
  *
  * @param mixed $subjectName  turned into the subject via RDFtriple::page()
  * @param mixed $propertyName appended to DB_ONTOLOGY_NS to form the predicate
  * @param mixed $value        raw property value
  * @return array list of RDFtriple objects
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $triples = array();
     $value = Util::removeHtmlComments(Util::removeHtmlTags($value));
     $parsedItems = $this->parser->parse($value);

     // Unparsed value: fall back to the cleaned text.
     if (!isset($parsedItems)) {
         $triples[] = new RDFtriple(
             RDFtriple::page($subjectName),
             RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
             RDFtriple::Literal($value)
         );
         return $triples;
     }

     foreach ($parsedItems as $item) {
         // Element 1 is the datatype; an empty one is passed on as null.
         $datatype = ($item[1] == "") ? null : $item[1];
         // Element 0 is the value; empty values produce no triple.
         if ($item[0] == "") {
             continue;
         }
         $triples[] = new RDFtriple(
             RDFtriple::page($subjectName),
             RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName),
             RDFtriple::Literal((string) $item[0], $datatype, null)
         );
     }
     return $triples;
 }
Example #29
0
 /**
  * Wraps a single text triple in an ExtractionResult and hands it to the
  * destination.
  *
  * The call to $destination->accept() is measured with the
  * 'destination:accept' timer.
  *
  * @param $destination must not be null; receives the result via accept()
  * @param $pageID turned into the subject URI with RDFtriple::page()
  * @param $predicate must be an RDF URI
  * @param $text turned into a literal tagged with $this->language via RDFtriple::Literal()
  * @return void
  */
 private function writeTriple($destination, $pageID, $predicate, $text)
 {
     $singleTripleResult = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $singleTripleResult->addTriple(
         RDFtriple::page($pageID),
         $predicate,
         RDFtriple::Literal($text, null, $this->language)
     );

     $timerName = 'destination:accept';
     Timer::start($timerName);
     $destination->accept($singleTripleResult);
     Timer::stop($timerName);
 }
Example #30
0
 /**
  * Extract person metadata from a page and return it as RDF triples.
  *
  * The person data is obtained from $this->extractPersondata(). If it is
  * empty/null, an ExtractionResult without triples is returned. Otherwise
  * triples are added, in this order, for: name, givenname, surname,
  * birthplace, birthdate, deathplace, deathdate, description, and finally
  * an rdf:type foaf:Person statement.
  *
  * Birth and death places are only emitted when the first [[...]] link in
  * the respective field can be mapped to an "[[en:...]]" link found in the
  * linked page's source. That source is only fetched when the option
  * 'Persondata.usedb' is enabled; with the option disabled no place triples
  * are produced.
  *
  * @param $pageID     page identifier, passed to the ExtractionResult
  * @param $pageTitle  not used by this method
  * @param $pageSource wiki source of the page
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $personData = $this->extractPersondata($pageSource, $this->language);
     if ($personData == null) {
         return $result;
     }

     $useDb = Options::getOption('Persondata.usedb');
     $wikiDb = $useDb ? new DatabaseWikipediaCollection($this->language) : null;

     // Resolve birth/death place links to the title used in their [[en:...]] link.
     $englishPlaces = array();
     foreach (array('birthplace', 'deathplace') as $field) {
         if (!isset($personData[$field])) {
             continue;
         }
         preg_match("/\\[\\[([^\\]]*)\\]\\]/", $personData[$field], $linkMatch);
         if (!isset($linkMatch[0])) {
             continue;
         }
         $linkTarget = $this->getLinkForLabeledLink($linkMatch);
         // Always start from an empty string: the previous code initialised
         // $mysource but read $mySource (variable names are case-sensitive),
         // so an undefined variable reached preg_match() when the DB was off.
         $linkedSource = '';
         if ($useDb) {
             $linkedSource = $wikiDb->getSource($linkTarget);
         }
         preg_match("/\\[\\[en:(.*)\\]\\]/", $linkedSource, $langLinkMatch);
         if (isset($langLinkMatch[1])) {
             $englishPlaces[$field] = $langLinkMatch[1];
         }
     }

     // NOTE(review): text literals are tagged "de" regardless of
     // $this->language - confirm this is intended.
     if (isset($personData['name']) && $personData['name'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_NAME, false), RDFtriple::Literal($personData['name'], null, "de"));
     }
     if (isset($personData['givenname']) && $personData['givenname'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_GIVENNAME, false), RDFtriple::Literal($personData['givenname'], null, "de"));
     }
     if (isset($personData['surname']) && $personData['surname'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(FOAF_SURNAME, false), RDFtriple::Literal($personData['surname'], null, "de"));
     }
     if (isset($englishPlaces['birthplace']) && $englishPlaces['birthplace'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTHPLACE, false), RDFtriple::page($englishPlaces['birthplace']));
     }
     if (isset($personData['birthdate']) && $personData['birthdate'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_BIRTH, false), RDFtriple::Literal($personData['birthdate'], XS_DATE, null));
     }
     if (isset($englishPlaces['deathplace']) && $englishPlaces['deathplace'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATHPLACE, false), RDFtriple::page($englishPlaces['deathplace']));
     }
     if (isset($personData['deathdate']) && $personData['deathdate'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_DEATH, false), RDFtriple::Literal($personData['deathdate'], XS_DATE, null));
     }
     if (isset($personData['description']) && $personData['description'] != "") {
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DC_DESCRIPTION, false), RDFtriple::Literal($personData['description'], null, "de"));
     }

     // Every page with person data is typed as a foaf:Person.
     $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(FOAF_PERSON, false));

     return $result;
 }