/**
 * Emits SKOS triples for a category page.
 *
 * When $pageID contains "<category namespace>:", the page is described with
 * a skos:prefLabel (the decoded page title), typed as skos:Concept, and
 * linked via skos:broader to every category referenced in the page source.
 * For any other page an empty result is returned.
 *
 * @param string $pageID     Page identifier; matched against the category namespace.
 * @param string $pageTitle  Page title, decoded for the label literal.
 * @param string $pageSource Wiki markup scanned for [[<namespace>:...]] links.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);
    // The namespace name is data, not a pattern: quote it before embedding it in a regex.
    $categoryPattern = preg_quote($category, '/');

    if (!preg_match('/' . $categoryPattern . ':(.*)/', $pageID)) {
        return $result;
    }

    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(SKOS_PREFLABEL, false),
        RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language)
    );
    $result->addTriple(
        $this->getPageURI(),
        RDFtriple::URI(RDF_TYPE, false),
        RDFtriple::URI(SKOS_CONCEPT, false)
    );

    // Non-greedy capture: a greedy (.*) would swallow everything between the
    // first "[[<namespace>:" and the LAST "]]" on a line, merging several
    // category links into a single bogus target.
    $linkPattern = '/\[\[' . $categoryPattern . ':(.*?)\]\]/';
    if (!preg_match_all($linkPattern, $pageSource, $links, PREG_SET_ORDER)) {
        return $result;
    }

    $categoryPrefix = Util::getDBpediaCategoryPrefix($this->language);
    foreach ($links as $link) {
        // Drop the sort key / label after the first '|', if there is one.
        $parts = explode('|', $link[1]);
        $object = $categoryPrefix . URI::wikipediaEncode($parts[0]);

        try {
            $object = RDFtriple::URI($object);
        } catch (Exception $e) {
            echo 'Caught exception: ', $e->getMessage(), "\n";
            continue;
        }

        $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_BROADER, false), $object);
    }

    return $result;
}
/**
 * Links a page to its wikicompany.org wiki page via foaf:page.
 *
 * A triple is only produced when the extractor language is "en"; for every
 * other language the returned result is empty.
 *
 * @param string $pageID     Page identifier used for the subject URI.
 * @param string $pageTitle  Page title, encoded into the wikicompany.org URL.
 * @param string $pageSource Unused.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);

    if ($this->language != "en") {
        return $extraction;
    }

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/page");
    $wikiPage = RDFtriple::URI("http://wikicompany.org/wiki/" . URI::wikipediaEncode($pageTitle));
    $extraction->addTriple($subject, $predicate, $wikiPage);

    return $extraction;
}
/**
 * Links the page's resource to its Wikipedia article.
 *
 * The predicate is foaf:page (FOAF_PAGE) for English and the
 * language-specific "wikipage-<lang>" property for every other language.
 *
 * @param string $pageID     Page identifier passed to the extraction result.
 * @param string $pageTitle  Page title, encoded into the Wikipedia URL.
 * @param string $pageSource Unused.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    // language code in URI uses '-', not '_'
    $language = str_replace('_', '-', $this->language);

    $subject = $this->getPageURI();

    // The language-suffixed predicate belongs to the non-English branch:
    // building "wikipage-" . $language only when $language is 'en' would
    // make the concatenation pointless, so the branches were swapped.
    // NOTE(review): confirm against the published property names.
    if ($language == 'en') {
        $predicate = RDFtriple::URI(FOAF_PAGE, false);
    } else {
        $predicate = RDFtriple::predicate("wikipage-" . $language);
    }

    $object = RDFtriple::URI("http://" . $language . ".wikipedia.org/wiki/" . URI::wikipediaEncode($pageTitle));

    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Emits owl:sameAs triples between this page's resource and the resources
 * of its inter-language links.
 *
 * Rows are read from the `page` / `langlinks` tables through $this->DBlink
 * for pages titled $pageID outside namespace 14. The script dies if the
 * query fails.
 *
 * @param string $pageID     Page title to look up; also used for the subject URI.
 * @param string $pageTitle  Unused.
 * @param string $pageSource Unused.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // Escape with the connection-aware function; mysql_escape_string() ignores
    // the connection's character set.
    $escapedTitle = mysql_real_escape_string($pageID, $this->DBlink);
    $query = "select page_title, page_namespace, ll.ll_lang,replace(trim(ll_title), ' ', '_')as lang_title from page p inner join langlinks ll on p.page_id = ll.ll_from where p.page_title= '" . $escapedTitle . "' and p.page_namespace <> 14";

    $queryresult = mysql_query($query, $this->DBlink) or die(" search unsuccessful: " . mysql_error());

    // The subject string is identical for every row; only the URI object is
    // built per row, so nothing is constructed when there are no rows.
    $subject = "http://" . $this->language . ".dbpedia.org/resource/" . URI::wikipediaEncode($pageID);

    while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
        $target = "http://" . $row["ll_lang"] . ".dbpedia.org/resource/" . URI::wikipediaEncode($row["lang_title"]);
        $result->addTriple(
            RDFtriple::URI($subject),
            RDFtriple::URI(OWL_SAMEAS, false),
            RDFtriple::URI($target)
        );
    }

    // Release the result set; this method runs once per page.
    mysql_free_result($queryresult);

    return $result;
}
/**
 * Emits one DB_WIKILINK triple per distinct internal link in the page source.
 *
 * Links come from extract_internal_links(); duplicates are removed. Targets
 * that cannot be turned into a URI are logged and skipped.
 *
 * @param string $pageID     Page identifier passed to the extraction result.
 * @param string $pageTitle  Unused.
 * @param string $pageSource Wiki markup to scan for internal links.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $pagelinks = array_unique($this->extract_internal_links($pageSource, $this->language));

    foreach ($pagelinks as $LinkURI) {
        $object = DB_RESOURCE_NS . ucfirst(URI::wikipediaEncode($LinkURI));

        try {
            $object = RDFtriple::URI($object);
        } catch (Exception $e) {
            // Build ONE message string: the previous comma-separated form
            // (left over from an echo statement) passed the exception text
            // as extra arguments instead of as part of the message.
            $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
            continue;
        }

        $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_WIKILINK, false), $object);
    }

    return $result;
}
/**
 * Emits DB_DISAMBIGUATES triples for a disambiguation page.
 *
 * Only links that contain the (suffix-stripped) name of the current page
 * and do not include a namespace are used.
 * Example: http://en.wikipedia.org/wiki/User
 *  - we omit [[Wikipedia:Username policy]]
 *  - we include [[User (computing)]] and many others
 *  - TODO: we should include [[Consumer]], but don't - it doesn't include "user"
 *
 * @param string $pageID     Page identifier, possibly carrying a "_(<disambiguation word>)" suffix.
 * @param string $pageTitle  Unused.
 * @param string $pageSource Wiki markup; tested with Util::isDisambiguation() and scanned for links.
 * @return ExtractionResult Empty when the page is not a disambiguation page.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    global $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION;

    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    if (!Util::isDisambiguation($pageSource, $this->language)) {
        return $result;
    }

    // Strip the disambiguation suffix from the page ID.
    if (isset($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language])) {
        foreach ($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language] as $disambig) {
            // strpos() returns 0 for a match at offset 0, so compare against false explicitly.
            if (strpos($pageID, $disambig) !== false) {
                $pageIDClean = str_replace('_(' . $disambig . ')', '', $pageID);
            }
        }
    } else {
        $pageIDClean = str_replace('_(disambiguation)', '', $pageID);
    }

    $warn = null;
    if (!isset($pageIDClean)) {
        // No configured suffix word occurs in the page ID.
        $pageIDClean = "";
        $warn = "pageidclean not set";
    }

    // Pass the delimiter to preg_quote(): without it a '/' in the page ID
    // terminates the pattern early and the regex fails to compile.
    $regex = '/\\[\\[([^:\\[\\]]*?' . preg_quote($pageIDClean, '/') . '[^\\[\\]]*?)\\]\\]/i';

    if (preg_match_all($regex, $pageSource, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            $object = DB_RESOURCE_NS . URI::wikipediaEncode($this->getLinkForLabeledLink($match[1]));

            try {
                $object = RDFtriple::URI($object);
            } catch (Exception $e) {
                $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
                continue;
            }

            $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(DB_DISAMBIGUATES, false), $object);
        }
    }

    if ($warn !== null) {
        $this->log('warn', $warn . " {$pageID} \n");
    }

    return $result;
}
/**
 * Resolves a page ID through the `redirects` table.
 *
 * PageID parameter must be ENCODED!
 *
 * Opens its own connection to the `dbpedia_extraction` database using the
 * settings from databaseconfig.php, looks up the decoded page ID in
 * redirects.page_from, and closes the connection again. The script dies if
 * connecting, selecting the database, or the query fails.
 *
 * @param string $pageID Encoded page ID.
 * @return string The encoded redirect target if a redirect row exists,
 *                otherwise $pageID unchanged.
 */
static function resolveRedirect($pageID)
{
    include "databaseconfig.php";

    $DBlink = mysql_connect($host, $user, $password, true) or die("Keine Verbindung moeglich: " . mysql_error());
    mysql_select_db('dbpedia_extraction', $DBlink) or die("RDFtriple: Auswahl der Datenbank fehlgeschlagen");
    mysql_query("SET NAMES utf8", $DBlink);

    // Turn the encoded ID back into the stored title form: underscores
    // become spaces and percent-escapes are decoded.
    $decPageID = str_replace("/", "%2F", $pageID);
    $decPageID = str_replace(":", "%3A", $decPageID);
    $decPageID = mysql_real_escape_string(urldecode(str_replace("_", " ", trim($decPageID))), $DBlink);

    $redirectquery = "select page_to from redirects where page_from = '{$decPageID}'";
    $redirectqueryresult = mysql_query($redirectquery, $DBlink) or die("RDFtriple: Anfrage redirectqueryresult fehlgeschlagen: " . mysql_error());
    $row = mysql_fetch_array($redirectqueryresult, MYSQL_ASSOC);

    // A fresh link is opened on every call (new_link = true), so it must be
    // released here or each lookup leaks a connection.
    mysql_free_result($redirectqueryresult);
    mysql_close($DBlink);

    // mysql_fetch_array() returns false when there is no redirect row.
    if (is_array($row) && isset($row['page_to'])) {
        return URI::wikipediaEncode($row['page_to']);
    }

    return $pageID;
}
/**
 * Emits a SKOS_SUBJECT triple for every category a non-category page links to.
 *
 * Pages whose ID contains "<category namespace>:" are skipped. For all
 * other pages each [[<namespace>:Name|sortkey]] link in the source yields
 * one triple pointing at the category resource; the part after '|' is
 * discarded. Targets that cannot be turned into a URI are logged and skipped.
 *
 * @param string $pageID     Page identifier; checked against the category namespace.
 * @param string $pageTitle  Unused.
 * @param string $pageSource Wiki markup scanned for category links.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);
    // The namespace name is data, not a pattern: quote it before embedding it in a regex.
    $categoryPattern = preg_quote($category, '/');

    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    // Category pages themselves are not handled here.
    if (preg_match('/' . $categoryPattern . ':/', $pageID)) {
        return $result;
    }

    // /U makes (.*) ungreedy so each link is matched on its own.
    $linkPattern = '/\[\[' . $categoryPattern . ':(.*)\]\]/U';
    if (!preg_match_all($linkPattern, $pageSource, $matches, PREG_SET_ORDER)) {
        return $result;
    }

    $categoryPrefix = Util::getDBpediaCategoryPrefix($this->language);
    foreach ($matches as $match) {
        // Strip the sort key: everything from the first '|' on.
        $Category = preg_replace("/\\|.*/", "", $match[1]);
        $object = $categoryPrefix . URI::wikipediaEncode($Category);

        try {
            $object = RDFtriple::URI($object);
        } catch (Exception $e) {
            // Build ONE message string: the previous comma-separated form
            // (left over from an echo statement) passed the exception text
            // as extra arguments instead of as part of the message.
            $this->log(WARN, 'Caught exception: ' . $e->getMessage() . "\n");
            continue;
        }

        $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_SUBJECT, false), $object);
    }

    return $result;
}
/**
 * Builds the wikicompany resource URI for a page.
 *
 * @param string $pageID Page identifier; passed through URI::wikipediaEncode().
 * @return URI URI under http://www4.wiwiss.fu-berlin.de/wikicompany/resource/.
 */
static function page($pageID)
{
    $encodedID = URI::wikipediaEncode($pageID);
    $resourceUri = "http://www4.wiwiss.fu-berlin.de/wikicompany/resource/" . $encodedID;

    return new URI($resourceUri);
}
/**
 * Writes the key/value pairs of every template on a page to the dump file
 * as SQL INSERT statements.
 *
 * For each {{...}} template in the page source, one INSERT into
 * `propertietriples` is written to $this->DumpFile per non-empty
 * "key = value" pair, followed by one wikiPageUsesTemplate statement for
 * the template itself. Templates without any key/value pair are skipped.
 * The method also increments and echoes $this->counter on every call.
 *
 * @param string $pageID     Page identifier used for the subject URI.
 * @param string $pageTitle  Page title; extraction is skipped when it decodes to NULL.
 * @param string $pageSource Wiki markup to scan for templates.
 * @return ExtractionResult Always empty; output goes to the dump file.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    include "databaseconfig.php";

    $this->counter++;
    echo $this->counter . "\n";

    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
    if ($this->decode_title($pageTitle) == NULL) {
        return $result;
    }

    // The subject is the same for every statement of this page. Computing it
    // up front also guarantees it is defined for the wikiPageUsesTemplate
    // statement when all values of a template are empty. It is escaped like
    // the other two SQL values.
    $s = "http://dbpedia.org/resource/" . URI::wikipediaEncode($pageID);
    $insertPrefix = "INSERT INTO propertietriples (resourceURI, propertiyURI, propertyValue) VALUES ('"
        . mysql_escape_string($s) . "','";

    // Remove comments
    $text = Util::removeComments($pageSource);

    // Search {{....}}
    preg_match_all('/\\{{2}((?>[^\\{\\}]+)|(?R))*\\}{2}/x', $text, $rawTemplates);

    foreach ($rawTemplates[0] as $rawTemplate) {
        if ($rawTemplate[0] != '{') {
            return $result;
        }

        // Delete {{ and }}
        $rawTemplate = substr($rawTemplate, 2, -2);

        // Get template name: everything before the first "|"
        preg_match_all("/([^|]*)/", $rawTemplate, $templateNames, PREG_SET_ORDER);
        $templateName = strtolower(trim($templateNames[0][0]));

        // Remove comments
        $rawTemplate = Util::removeComments($rawTemplate);

        // Replace "|" inside subtemplates to avoid splitting them like triples
        $rawTemplate = preg_replace_callback(
            "/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/",
            array($this, 'replaceBarInSubTemplate'),
            $rawTemplate
        );

        // Replace "|" inside labeled links to avoid splitting them like
        // triples; repeat until a pass changes nothing.
        do {
            $previous = $rawTemplate;
            $rawTemplate = preg_replace(
                '/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/',
                '[[\\1***@@@***@@@***@@@***@@@\\2]]',
                $previous
            );
        } while ($rawTemplate !== $previous);

        // Find template keyvalue pairs
        preg_match_all("/\\|\\s*\\|?\\s*([^=|<>]+)\\s*=([^|]*)/", $rawTemplate, $keyvalues, PREG_SET_ORDER);

        // Next template if there are no keyvalue pairs. This used to return,
        // which silently dropped every remaining template on the page.
        if (count($keyvalues) == 0) {
            continue;
        }

        foreach ($keyvalues as $keyvalue) {
            // Restore the "|" characters that were masked above.
            $keyvalue = str_replace('***@@@***@@@***@@@***@@@', '|', $keyvalue);
            $keyvalue = str_replace('***---***---***---***---', '|', $keyvalue);

            $propkey = trim($keyvalue[1]);
            $propvalue = trim($keyvalue[2]);
            if ($propvalue == '') {
                continue;
            }

            $p = "http://dbpedia.org/property/" . $this->propertyToCamelCase($propkey);
            $o = $propvalue;
            $line = $insertPrefix . mysql_escape_string($p) . "','" . mysql_escape_string($o) . "')";
            fwrite($this->DumpFile, $line . "\n");
        }

        // add wikiPageUsesTemplate
        $p = "http://dbpedia.org/property/wikiPageUsesTemplate";
        $o = "http://dbpedia.org/resource/Template:" . $this->encodeLocalName($templateName);
        $line = $insertPrefix . mysql_escape_string($p) . "','" . mysql_escape_string($o) . "')";
        fwrite($this->DumpFile, $line . "\n");
    }

    return $result;
}
/**
 * Returns the resource URI for a page, caching the most recent result.
 *
 * The page ID is encoded with URI::wikipediaEncode(), its first character
 * is upper-cased, and DB_RESOURCE_NS is prepended. Only the last page ID
 * and its URI are remembered.
 *
 * @param string $pageID Page identifier.
 * @return URI
 */
public static function page($pageID)
{
    // Strict comparison: with loose != two different numeric-looking IDs
    // (e.g. "7" and "007", or "1000" and "1e3") compare equal and the
    // cached URI of the wrong page would be returned.
    if (self::$pageCacheKey !== $pageID) {
        $encPageID = URI::wikipediaEncode($pageID);
        $resourceURI = DB_RESOURCE_NS . ucfirst($encPageID);

        // Store the value before the key so a failing URI constructor
        // leaves the cache untouched.
        self::$pageCacheValue = new URI($resourceURI);
        self::$pageCacheKey = $pageID;
    }

    return self::$pageCacheValue;
}