/**
 * Builds SKOS triples for a category page.
 *
 * When $pageID does not contain "<category namespace>:" the result is
 * returned empty. Otherwise the result receives a SKOS_PREFLABEL literal
 * built from the decoded page title, an RDF_TYPE / SKOS_CONCEPT triple, and
 * one SKOS_BROADER triple for every category link found in the page source.
 *
 * @param string $pageID     page identifier, tested against the category namespace
 * @param string $pageTitle  page title, passed through decode_title() for the label
 * @param string $pageSource page markup scanned for [[<category namespace>:...]] links
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     // Quote the namespace name so any regex metacharacter in it is matched literally.
     $category = preg_quote(Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE), '/');
     if (!preg_match("/" . $category . ":/", $pageID)) {
         return $result;
     }
     $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_PREFLABEL, false), RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language));
     $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(SKOS_CONCEPT, false));
     // Lazy quantifier: a greedy (.*) would swallow everything between the first
     // "[[<category>:" and the last "]]" on a line, merging several links into one.
     if (!preg_match_all("/\\[\\[" . $category . ":(.*?)\\]\\]/", $pageSource, $links, PREG_SET_ORDER)) {
         return $result;
     }
     $prefix = Util::getDBpediaCategoryPrefix($this->language);
     foreach ($links as $link) {
         // Keep only the part before the first '|'; explode() always yields at least one element.
         $parts = explode('|', $link[1], 2);
         $object = $prefix . URI::wikipediaEncode($parts[0]);
         try {
             $object = RDFtriple::URI($object);
         } catch (Exception $e) {
             // Skip links that cannot be turned into a URI.
             echo 'Caught exception: ', $e->getMessage(), "\n";
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_BROADER, false), $object);
     }
     return $result;
 }
Example #2
0
 /**
  * Links a page to its wikicompany.org wiki page.
  *
  * A single foaf:page triple is added when the extractor language is "en";
  * for every other language the result is returned empty.
  *
  * @param string $pageID     page identifier used for the subject
  * @param string $pageTitle  page title, encoded into the wikicompany.org URL
  * @param string $pageSource unused here
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);
     if ($this->language != "en") {
         return $extraction;
     }
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/page");
     $wikiPage = RDFtriple::URI("http://wikicompany.org/wiki/" . URI::wikipediaEncode($pageTitle));
     $extraction->addTriple($subject, $predicate, $wikiPage);
     return $extraction;
 }
Example #3
0
 /**
  * Links the page URI to the corresponding Wikipedia page URL.
  *
  * @param string $pageID     page identifier, passed to the ExtractionResult
  * @param string $pageTitle  page title, encoded into the Wikipedia URL
  * @param string $pageSource unused here
  * @return ExtractionResult result holding exactly one triple
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // language code in URI uses '-', not '_'
     $langCode = str_replace('_', '-', $this->language);
     $pageURI = $this->getPageURI();
     // NOTE(review): 'en' gets the language-specific "wikipage-en" predicate and
     // every other language gets FOAF_PAGE - confirm this is not inverted.
     if ($langCode == 'en') {
         $property = RDFtriple::predicate("wikipage-" . $langCode);
     } else {
         $property = RDFtriple::URI(FOAF_PAGE, false);
     }
     $wikipediaURL = "http://" . $langCode . ".wikipedia.org/wiki/" . URI::wikipediaEncode($pageTitle);
     $target = RDFtriple::URI($wikipediaURL);
     $extraction = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $extraction->addTriple($pageURI, $property, $target);
     return $extraction;
 }
 /**
  * Emits OWL_SAMEAS triples between this page's resource and the resources
  * of the same page in other languages.
  *
  * Rows come from the `page` / `langlinks` join for the given page title,
  * excluding namespace 14. Each row yields one triple from
  * http://<language>.dbpedia.org/resource/<pageID> to
  * http://<ll_lang>.dbpedia.org/resource/<lang_title>.
  * The script terminates via die() when the query fails.
  *
  * @param string $pageID     page identifier, matched against page.page_title
  * @param string $pageTitle  unused here
  * @param string $pageSource unused here
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     // mysql_escape_string() is deprecated; the link-aware variant is its documented replacement.
     $escapedPageID = mysql_real_escape_string($pageID, $this->DBlink);
     $query = "select page_title, page_namespace, ll.ll_lang,replace(trim(ll_title), ' ', '_')as lang_title from page p inner join langlinks ll on p.page_id = ll.ll_from where p.page_title= '" . $escapedPageID . "' and p.page_namespace <> 14";
     $queryresult = mysql_query($query, $this->DBlink) or die(" search unsuccessful: " . mysql_error());
     // The subject string is the same for every row, so build it once.
     $subject = "http://" . $this->language . ".dbpedia.org/resource/" . URI::wikipediaEncode($pageID);
     while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
         $object = "http://" . $row["ll_lang"] . ".dbpedia.org/resource/" . URI::wikipediaEncode($row["lang_title"]);
         $result->addTriple(RDFtriple::URI($subject), RDFtriple::URI(OWL_SAMEAS, false), RDFtriple::URI($object));
     }
     mysql_free_result($queryresult);
     return $result;
 }
Example #5
0
 /**
  * Emits one DB_WIKILINK triple per distinct internal link in the page source.
  *
  * Links come from extract_internal_links() and are de-duplicated; each is
  * encoded, upper-cased on its first character and prefixed with
  * DB_RESOURCE_NS. Links that cannot be turned into a URI are logged and
  * skipped.
  *
  * @param string $pageID     page identifier, passed to the ExtractionResult
  * @param string $pageTitle  unused here
  * @param string $pageSource page markup handed to extract_internal_links()
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $pagelinks = array_unique($this->extract_internal_links($pageSource, $this->language));
     foreach ($pagelinks as $LinkURI) {
         $object = DB_RESOURCE_NS . ucfirst(URI::wikipediaEncode($LinkURI));
         try {
             $object = RDFtriple::URI($object);
         } catch (Exception $e) {
             // Concatenate into one message: the previous echo-style commas passed
             // the exception text as extra arguments instead of part of the message.
             $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_WIKILINK, false), $object);
     }
     return $result;
 }
 /**
  * Emits DB_DISAMBIGUATES triples for the links of a disambiguation page.
  *
  * Pages for which Util::isDisambiguation() is false yield an empty result.
  * Otherwise the disambiguation suffix is stripped from $pageID and every
  * [[...]] link that contains the remaining name (case-insensitively) and
  * has no ':' before it becomes the object of one triple.
  *
  * @param string $pageID     page identifier, e.g. ending in "_(disambiguation)"
  * @param string $pageTitle  unused here
  * @param string $pageSource page markup scanned for links
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     global $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION;
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (!Util::isDisambiguation($pageSource, $this->language)) {
         return $result;
     }
     // use only links that include the name of the current page and don't include a namespace.
     // Example: http://en.wikipedia.org/wiki/User
     // - we omit [[Wikipedia:Username policy]]
     // - we include [[User (computing)]] and many others
     // - TODO: we should include [[Consumer]], but don't - it doesn't include "user"
     $pageIDClean = null;
     $warn = null;
     if (isset($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language])) {
         foreach ($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language] as $disambig) {
             // strpos() returns 0 for a match at the very start, so compare against false explicitly.
             if (strpos($pageID, $disambig) !== false) {
                 $pageIDClean = str_replace('_(' . $disambig . ')', '', $pageID);
             }
         }
     } else {
         $pageIDClean = str_replace('_(disambiguation)', '', $pageID);
     }
     if ($pageIDClean === null) {
         // No configured suffix occurred in the page ID; fall back to an empty name and warn below.
         $pageIDClean = "";
         $warn = "pageidclean not set";
     }
     // Pass the delimiter to preg_quote() so a '/' inside the page ID cannot end the pattern early.
     // NOTE(review): $pageIDClean keeps the underscores of $pageID, so it only matches
     // links written with underscores - confirm whether spaces should match as well.
     $regex = '/\\[\\[([^:\\[\\]]*?' . preg_quote($pageIDClean, '/') . '[^\\[\\]]*?)\\]\\]/i';
     if (preg_match_all($regex, $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             $object = DB_RESOURCE_NS . URI::wikipediaEncode($this->getLinkForLabeledLink($match[1]));
             try {
                 $object = RDFtriple::URI($object);
             } catch (Exception $e) {
                 $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
                 continue;
             }
             $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(DB_DISAMBIGUATES, false), $object);
         }
     }
     if ($warn !== null) {
         $this->log('warn', $warn . " {$pageID} \n");
     }
     return $result;
 }
 /**
  * Looks up the redirect target of a page in the `redirects` table of the
  * `dbpedia_extraction` database.
  *
  * PageID Parameter must be ENCODED!
  *
  * A new MySQL connection is opened for the lookup and closed again before
  * returning. The script terminates via die() when connecting, selecting the
  * database or running the query fails.
  *
  * @param string $pageID encoded page identifier
  * @return string the wikipedia-encoded `page_to` value when a redirect row
  *                exists, otherwise $pageID unchanged
  **/
 static function resolveRedirect($pageID)
 {
     // presumably defines $host, $user and $password - verify against databaseconfig.php
     include "databaseconfig.php";
     $DBlink = mysql_connect($host, $user, $password, true) or die("Keine Verbindung moeglich: " . mysql_error());
     mysql_select_db('dbpedia_extraction', $DBlink) or die("RDFtriple: Auswahl der Datenbank fehlgeschlagen");
     mysql_query("SET NAMES utf8", $DBlink);
     // Replacements run in order: '/' first, then ':'.
     $decPageID = str_replace(array("/", ":"), array("%2F", "%3A"), $pageID);
     // mysql_escape_string() is deprecated; the link-aware variant is its documented replacement.
     $decPageID = mysql_real_escape_string(urldecode(str_replace("_", " ", trim($decPageID))), $DBlink);
     $redirectquery = "select page_to from redirects where page_from = '{$decPageID}'";
     $redirectqueryresult = mysql_query($redirectquery, $DBlink) or die("RDFtriple: Anfrage redirectqueryresult fehlgeschlagen: " . mysql_error());
     $row = mysql_fetch_array($redirectqueryresult, MYSQL_ASSOC);
     mysql_free_result($redirectqueryresult);
     // The link is opened with new_link = true, so each call owns its connection;
     // close it here instead of leaving one open per call.
     mysql_close($DBlink);
     // mysql_fetch_array() returns false when no row matched.
     if ($row !== false && isset($row['page_to'])) {
         return URI::wikipediaEncode($row['page_to']);
     }
     return $pageID;
 }
 /**
  * Emits one SKOS_SUBJECT triple per category link of a non-category page.
  *
  * Pages whose ID contains "<category namespace>:" yield an empty result.
  * For all others every [[<category namespace>:...]] link in the source is
  * cut at its first '|', encoded and prefixed with the DBpedia category
  * prefix. Links that cannot be turned into a URI are logged and skipped.
  *
  * @param string $pageID     page identifier, tested against the category namespace
  * @param string $pageTitle  unused here
  * @param string $pageSource page markup scanned for category links
  * @return ExtractionResult
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Quote the namespace name so any regex metacharacter in it is matched literally.
     $category = preg_quote(Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE), '/');
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (preg_match("/" . $category . ":/", $pageID)) {
         return $result;
     }
     // The U modifier makes (.*) lazy, so each link is captured separately.
     if (!preg_match_all("/\\[\\[" . $category . ":(.*)\\]\\]/U", $pageSource, $matches, PREG_SET_ORDER)) {
         return $result;
     }
     $prefix = Util::getDBpediaCategoryPrefix($this->language);
     foreach ($matches as $match) {
         // Strip everything from the first '|' onwards.
         $categoryName = preg_replace("/\\|.*/", "", $match[1]);
         $object = $prefix . URI::wikipediaEncode($categoryName);
         try {
             $object = RDFtriple::URI($object);
         } catch (Exception $e) {
             // Concatenate into one message: the previous echo-style commas passed
             // the exception text as extra arguments instead of part of the message.
             $this->log(WARN, 'Caught exception: ' . $e->getMessage() . "\n");
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_SUBJECT, false), $object);
     }
     return $result;
 }
Example #9
0
 /**
  * Builds the wikicompany resource URI for a page.
  *
  * @param string $pageID page identifier, encoded with URI::wikipediaEncode()
  * @return URI
  */
 static function page($pageID)
 {
     $namespace = "http://www4.wiwiss.fu-berlin.de/wikicompany/resource/";
     $localName = URI::wikipediaEncode($pageID);
     return new URI($namespace . $localName);
 }
Example #10
0
 /**
  * Writes the key/value pairs of every {{...}} template in the page source
  * to the dump file as SQL INSERT statements.
  *
  * For each template with at least one "key = value" pair, one INSERT line
  * is written per non-empty value, followed by one wikiPageUsesTemplate
  * line naming the template. Nothing is added to the returned result; the
  * output goes to $this->DumpFile. The call also increments and echoes
  * $this->counter.
  *
  * @param string $pageID     page identifier used for the resource URI
  * @param string $pageTitle  page title; processing stops when decode_title() yields NULL
  * @param string $pageSource page markup scanned for templates
  * @return ExtractionResult always without triples added by this method
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     include "databaseconfig.php";
     $this->counter++;
     echo $this->counter . "\n";
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->decode_title($pageTitle) == NULL) {
         return $result;
     }
     $insertFormat = "INSERT INTO propertietriples (resourceURI, propertiyURI, propertyValue) VALUES ('%s','%s','%s')";
     // Build the subject once, before any template is processed. It used to be
     // assigned only after a non-empty value was seen, so the template-usage
     // row could be written with an undefined subject. It is escaped like the
     // other two columns so a quote in the page ID cannot break the statement.
     $s = mysql_escape_string("http://dbpedia.org/resource/" . URI::wikipediaEncode($pageID));
     // Remove comments
     $text = Util::removeComments($pageSource);
     // Search {{....}}
     preg_match_all('/\\{{2}((?>[^\\{\\}]+)|(?R))*\\}{2}/x', $text, $rawTemplates);
     foreach ($rawTemplates[0] as $rawTemplate) {
         if ($rawTemplate[0] != '{') {
             continue;
         }
         // Delete {{ and }}
         $rawTemplate = substr($rawTemplate, 2, -2);
         // get template name: everything before the first '|'
         preg_match_all("/([^|]*)/", $rawTemplate, $templateNames, PREG_SET_ORDER);
         $templateName = strtolower(trim($templateNames[0][0]));
         // Remove comments
         $rawTemplate = Util::removeComments($rawTemplate);
         // Replace "|" inside subtemplates to avoid splitting them like triples
         $rawTemplate = preg_replace_callback("/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/", array($this, 'replaceBarInSubTemplate'), $rawTemplate);
         // Replace "|" inside labeled links to avoid splitting them like triples;
         // repeat until a pass changes nothing.
         do {
             $previous = $rawTemplate;
             $rawTemplate = preg_replace('/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/', '[[\\1***@@@***@@@***@@@***@@@\\2]]', $previous);
         } while ($rawTemplate != $previous);
         // Find template keyvalue pairs
         preg_match_all("/\\|\\s*\\|?\\s*([^=|<>]+)\\s*=([^|]*)/", $rawTemplate, $keyvalues, PREG_SET_ORDER);
         // Next template if there are no keyvalue pairs (previously this returned
         // and silently dropped every remaining template of the page).
         if (count($keyvalues) == 0) {
             continue;
         }
         foreach ($keyvalues as $keyvalue) {
             // Restore the '|' characters that were masked above.
             // presumably the second marker is produced by replaceBarInSubTemplate() - verify
             $keyvalue = str_replace(array('***@@@***@@@***@@@***@@@', '***---***---***---***---'), '|', $keyvalue);
             $propkey = trim($keyvalue[1]);
             $propvalue = trim($keyvalue[2]);
             if ($propvalue == '') {
                 continue;
             }
             $p = "http://dbpedia.org/property/" . $this->propertyToCamelCase($propkey);
             $line = sprintf($insertFormat, $s, mysql_escape_string($p), mysql_escape_string($propvalue));
             fwrite($this->DumpFile, $line . "\n");
         }
         // add wikiPageUsesTemplate
         $p = "http://dbpedia.org/property/wikiPageUsesTemplate";
         $o = "http://dbpedia.org/resource/Template:" . $this->encodeLocalName($templateName);
         $line = sprintf($insertFormat, $s, mysql_escape_string($p), mysql_escape_string($o));
         fwrite($this->DumpFile, $line . "\n");
     }
     return $result;
 }
Example #11
0
 /**
  * Returns the resource URI for a page, remembering the most recent result.
  *
  * The URI is DB_RESOURCE_NS followed by the encoded page ID with its first
  * character upper-cased. Only the last page ID and its URI are kept, so a
  * repeated call with the same ID returns the stored URI object.
  *
  * @param string $pageID page identifier
  * @return URI
  */
 public static function page($pageID)
 {
     // Strict comparison: with != numeric-looking IDs such as "7" and "007"
     // compare equal and the URI cached for the other page would be returned.
     if (self::$pageCacheKey !== $pageID) {
         $resourceURI = DB_RESOURCE_NS . ucfirst(URI::wikipediaEncode($pageID));
         self::$pageCacheValue = new URI($resourceURI);
         self::$pageCacheKey = $pageID;
     }
     return self::$pageCacheValue;
 }