Example #1
0
 /**
  * Extracts semantic annotations of the form [[property::Target]] and
  * [[property:=value]] from a page's wiki source.
  *
  * Brace-delimited template calls are removed from the source first; then
  * every "::" annotation yields a triple whose object is a page resource and
  * every ":=" annotation yields a triple whose object is a literal parsed by
  * parseAttributeValue().
  *
  * @param $pageID identifier of the page; passed through encodeLocalName()
  * before it is used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult the triples found on the page.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);

     // Strip template calls: find every balanced {...} group (recursive
     // pattern), drop its outer "{{" / "}}" and remove "{{<inner>}}" from
     // the source so that annotations inside templates are not extracted.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }

     // Internal semantic links: [[property::Target]].
     // The property class is A-Z (the previous "A-z" range also admitted
     // the characters [ \ ] ^ ` that sit between 'Z' and 'a' in ASCII).
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }

     // Literal annotations: [[property:=value]].
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // Index 0 is used as the lexical form and index 2 as the datatype.
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));
         // Skip values that parsed to nothing.
         if ($lexicalForm == null) {
             continue;
         }
         // NOTE(review): the literal language tag is fixed to 'en' regardless
         // of $this->language - confirm this is intended.
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }
     return $result;
 }
Example #2
0
 /**
  * Initialises the extractor and the two predicates it keeps for abstracts.
  *
  * The constructor takes no arguments. After delegating to the parent
  * constructor it stores:
  *  - shortPredicate: the URI built from the RDFS_COMMENT constant;
  *  - longPredicate: the predicate named "abstract".
  */
 public function __construct()
 {
     parent::__construct();
     //no validation required
     $this->shortPredicate = RDFtriple::URI(RDFS_COMMENT, false);
     $this->longPredicate = RDFtriple::predicate("abstract");
 }
 /**
  * Produces a single "abstract" triple for the page.
  *
  * The wiki markup is cleaned with remove_wikicode() and then shortened by
  * extract_abstract() (called with 3000, false, false); the resulting text
  * is attached to the page as a literal tagged with the extractor language.
  *
  * @param $pageID identifier of the page, used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult result holding the abstract triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);

     $plainText = $this->remove_wikicode($pageSource);
     $abstractText = $this->extract_abstract($plainText, 3000, false, false);

     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::predicate("abstract");
     $object = RDFtriple::Literal($abstractText, null, $this->language);
     $extraction->addTriple($subject, $predicate, $object);

     return $extraction;
 }
Example #4
0
 /**
  * Links the current page resource to the URL of its Wikipedia article.
  *
  * @param $pageID identifier of the page, passed to the ExtractionResult.
  * @param $pageTitle page title; encoded with URI::wikipediaEncode() to
  * build the article URL.
  * @param $pageSource not used by this extractor.
  * @return ExtractionResult result holding the single link triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Host names use '-' where the configured language code uses '_'.
     $langCode = str_replace('_', '-', $this->language);
     $pageUri = $this->getPageURI();

     // NOTE(review): English pages get the "wikipage-en" predicate while all
     // other languages get FOAF_PAGE; this reads as if the two branches
     // might be swapped - confirm the intended mapping.
     if ($langCode == 'en') {
         $linkPredicate = RDFtriple::predicate("wikipage-" . $langCode);
     } else {
         $linkPredicate = RDFtriple::URI(FOAF_PAGE, false);
     }

     $articleUrl = "http://" . $langCode . ".wikipedia.org/wiki/" . URI::wikipediaEncode($pageTitle);
     $articleUri = RDFtriple::URI($articleUrl);

     $extraction = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $extraction->addTriple($pageUri, $linkPredicate, $articleUri);
     return $extraction;
 }
Example #5
0
 /**
  * Adds one "wordnet_type" triple per WordNet synset link that the database
  * associates with a template used on the page.
  *
  * Only runs when the extractor language is "en"; for any other language an
  * empty result is returned. The lookup joins templatelinks, page and the
  * dbpedia_develop WordNet mapping tables on the connection in
  * $this->DBlink. A failing query terminates the script via die().
  *
  * @param $pageID identifier of the page; matched against page.page_title
  * and used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource not used by this extractor.
  * @return ExtractionResult the WordNet type triples (possibly none).
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->language != "en") {
         return $result;
     }

     // Escape with the connection-aware function: mysql_escape_string() is
     // deprecated and ignores the character set of the connection.
     $escapedPageID = mysql_real_escape_string($pageID, $this->DBlink);
     $query = "select wsl.url from templatelinks tl inner join page p on p.page_id = tl.tl_from\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_mapping wm on tl.tl_title = wm.infobox\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_synsets_links wsl on wm.ID1 = wsl.synset30ID\r\n\t\t\t\t\t\twhere p.page_title = '" . $escapedPageID . "' and p.page_namespace = 0";

     // Report the error of the link that was actually queried.
     $queryresult = mysql_query($query, $this->DBlink) or die("Query failed:\n{$query}\n" . mysql_error($this->DBlink));
     while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("wordnet_type"), RDFtriple::URI($row["url"]));
     }
     return $result;
 }
Example #6
0
 /**
  * Emits FOAF / DBpedia triples for the person data found on a page.
  *
  * extractPersondata() supplies name, givenname, surname, birthplace,
  * birthdate, deathplace, deathdate and description. Birth and death places
  * are resolved to the title named in the place article's [[en:...]]
  * interlanguage link; a place triple is only written when that title is
  * non-empty. Text literals are tagged "de"; dates are typed xsd:date.
  *
  * @param $pageID identifier of the page, used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult the person triples, empty when no person data
  * was found.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $PersonData = $this->extractPersondata($pageSource, $this->language);
     if ($PersonData == null) {
         return $result;
     }

     $WikiDB = new DatabaseWikipedia($this->language);
     $BirthPlace = $this->findEnglishPlaceTitle($WikiDB, $PersonData['birthplace']);
     $DeathPlace = $this->findEnglishPlaceTitle($WikiDB, $PersonData['deathplace']);

     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/name"), RDFtriple::Literal($PersonData['name'], null, "de"));
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/givenname"), RDFtriple::Literal($PersonData['givenname'], null, "de"));
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/surname"), RDFtriple::Literal($PersonData['surname'], null, "de"));
     if ($BirthPlace != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birthPlace"), RDFtriple::page($BirthPlace));
     }
     if ($PersonData['birthdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("birth"), RDFtriple::Literal($PersonData['birthdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if ($DeathPlace != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("deathPlace"), RDFtriple::page($DeathPlace));
     }
     if ($PersonData['deathdate'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("death"), RDFtriple::Literal($PersonData['deathdate'], "http://www.w3.org/2001/XMLSchema#date", null));
     }
     if ($PersonData['description'] != "") {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://purl.org/dc/elements/1.1/description"), RDFtriple::Literal($PersonData['description'], null, "de"));
     }
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI("http://xmlns.com/foaf/0.1/Person"));
     return $result;
 }

 /**
  * Resolves the first wiki link in a place field to the title given by the
  * linked article's [[en:...]] interlanguage link.
  *
  * @param $WikiDB object whose getSource() returns the wiki source of an
  * article.
  * @param $placeText raw persondata field that may contain a [[...]] link.
  * @return string the English title, or "" when the field holds no link or
  * the linked article has no [[en:...]] link.
  */
 private function findEnglishPlaceTitle($WikiDB, $placeText)
 {
     // Without a link there is nothing to look up; skip the wiki lookup.
     if (!preg_match("/\\[\\[([^\\]]*)\\]\\]/", $placeText, $placeLink)) {
         return "";
     }
     // The whole match array is handed over, exactly as before.
     // NOTE(review): confirm getLinkForLabeledLink() expects the match array
     // here rather than the captured link text.
     $placeArticle = $this->getLinkForLabeledLink($placeLink);
     $placeSource = $WikiDB->getSource($placeArticle);
     // [^\]]* stops at the first "]]" so that further links on the same line
     // are not swallowed into the title (the former ".*" was greedy).
     if (!preg_match("/\\[\\[en:([^\\]]*)\\]\\]/", $placeSource, $langLink)) {
         return "";
     }
     return $langLink[1];
 }
 /**
  * Adds a "reference" triple for every valid external link on the page.
  *
  * extract_external_links() returns a collection whose keys are the link
  * URLs; URLs rejected by URI::validate() are skipped.
  *
  * @param $pageID identifier of the page, used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult the reference triples (possibly none).
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach replaces the former list()/each() loop: each() is deprecated
     // since PHP 7.2 and no longer exists in PHP 8.
     foreach ($extlinks as $ExtURL => $ExtName) {
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("reference"), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
Example #8
0
 /**
  * Template for a page extractor: emits one placeholder triple when the
  * page source contains the text "{{chembox header}}".
  *
  * @param $pageID identifier of the page, used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult the extracted triples, empty when the marker is
  * absent.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Container for every triple extracted from this page.
     $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);

     // Nothing to do unless the chembox header marker is present.
     $hasChemboxHeader = preg_match("/{{chembox header}}/", $pageSource, $match);
     if (!$hasChemboxHeader) {
         return $extraction;
     }

     // Real parsing would go here; one triple is added per property ...
     $extraction->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::predicate("my_chem_property"),
         RDFtriple::Literal("my_value")
     );
     // ... and each predicate is registered in the predicate collection.
     $this->allPredicates->addPredicate("my_chem_property");

     return $extraction;
 }
 /**
  * Registers redirecting pages in the template_uri table.
  *
  * For a redirect page, every [[...]] link target is normalised (lower-cased,
  * spaces to underscores, "template:" to "Template:") and looked up in
  * template_uri. For each row found, the normalised URI of the redirecting
  * page itself is inserted with that row's template_id unless it is already
  * present. Each insert increments $this->redirectTemplateCounter and prints
  * a progress line. A failing query terminates the script via die().
  *
  * No triples are added to the returned result.
  * NOTE(review): earlier code built a page/"redirect"/target triple for each
  * link but never passed it to addTriple(); confirm whether the triple
  * should be emitted.
  *
  * @param $pageID identifier of the (possibly redirecting) page.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult an empty result for the page.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (!Util::isRedirect($pageSource, $this->language)) {
         return $result;
     }
     if (!preg_match_all("/\\[\\[([^\\]]*)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         return $result;
     }

     // URI of the redirecting page; it only depends on $pageID, so it is
     // built once. Values are escaped with mysql_real_escape_string()
     // instead of a hand-rolled quote replacement, which left backslashes
     // unescaped (a title ending in "\" produced broken SQL and hit die()).
     $newtemplateuri = DB_RESOURCE_NS . str_replace("template:", "Template:", str_replace(" ", "_", mb_strtolower($pageID)));
     $newtemplateuriSql = mysql_real_escape_string($newtemplateuri, $this->link);

     foreach ($matches as $match) {
         $templateredirecturi = DB_RESOURCE_NS . str_replace("template:", "Template:", str_replace(" ", "_", mb_strtolower($match[1])));
         $query = "select * from template_uri where uri = '" . mysql_real_escape_string($templateredirecturi, $this->link) . "'";
         $dbresult = mysql_query($query, $this->link) or die("Query failed: " . mysql_error($this->link) . ' - ' . $query);
         while ($row = mysql_fetch_array($dbresult, MYSQL_ASSOC)) {
             $template_id = $row['template_id'];
             // Re-checked on every row because an insert below changes it.
             $query_template_uri = "select * from template_uri where uri = '{$newtemplateuriSql}'";
             $dbresult_template_uri = mysql_query($query_template_uri, $this->link) or die("Query failed: " . mysql_error($this->link) . ' - ' . $query_template_uri);
             if (mysql_num_rows($dbresult_template_uri) > 0) {
                 // The redirecting page is already registered.
                 continue;
             }
             $insertquery = "INSERT INTO template_uri (template_id, uri) VALUES ('" . mysql_real_escape_string($template_id, $this->link) . "', '" . $newtemplateuriSql . "')";
             mysql_query($insertquery, $this->link) or die("Query failed: " . mysql_error($this->link) . ' - ' . $insertquery);
             $this->redirectTemplateCounter++;
             echo "{$this->redirectTemplateCounter}: {$pageID} => {$match['1']}";
             echo " {$newtemplateuri} (FOUND)";
             echo "\n";
         }
     }
     return $result;
 }
 /**
  * Records the length of the page source as a "characterCount" literal.
  *
  * The length is counted in characters with mb_strlen() and UTF-8, so that
  * multi-byte characters count once each; strlen() would report the number
  * of bytes instead.
  * NOTE(review): assumes $pageSource is UTF-8 encoded - confirm against the
  * source the pages are read from.
  *
  * @param $pageID identifier of the page, used as the triple subject.
  * @param $pageTitle not used by this extractor.
  * @param $pageSource raw wiki markup of the page.
  * @return ExtractionResult result holding the single count triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $characterCount = mb_strlen($pageSource, 'UTF-8');
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("characterCount"), RDFtriple::literal($characterCount));
     return $result;
 }