Example #1
0
 /**
  * Emits one rdf:type triple per YAGO class found (via the `facts` table)
  * as a `subClassOf` target of any category linked from the page.
  *
  * Pages whose ID matches the category namespace are skipped.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Category pages themselves are not typed.
     if (preg_match("/" . WIKIMEDIA_CATEGORY . ":/", $pageID)) {
         return $result;
     }
     // Non-greedy capture: with a greedy (.*) two category links on the same
     // line would be merged into a single bogus category name.
     if (!preg_match_all("/\\[\\[" . WIKIMEDIA_CATEGORY . ":(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         return $result;
     }
     // Collect classes as array keys so every class is emitted only once.
     $this->ClassArray = array();
     foreach ($matches as $match) {
         // Drop the sort key / label that follows a pipe.
         $Category = preg_replace("/\\|.*/", "", $match[1]);
         // Escape with the connection-aware function so the connection's
         // character set is taken into account.
         $escaped = mysql_real_escape_string(str_replace(" ", "_", $Category), $this->DBlink);
         $query = "select Arg2 from facts where Relation = 'subClassOf' and Arg1 = 'wikicategory_" . $escaped . "'";
         $queryresult = mysql_query($query, $this->DBlink) or die("Anfrage fehlgeschlagen: " . mysql_error());
         while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
             $this->ClassArray[$row["Arg2"]] = true;
         }
     }
     foreach (array_keys($this->ClassArray) as $subject) {
         // Strip the "wordnet_" marker before building the class URI.
         $YagoClass = str_replace("wordnet_", "", $subject);
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(DB_YAGO_NS . $this->camel($YagoClass, "_")));
     }
     return $result;
 }
Example #2
0
 /**
  * Extracts semantic annotations of the form [[property::target]] (links)
  * and [[property:=value]] (literals) from the page source.
  *
  * Brace-delimited template blocks are removed from the source first.
  *
  * @param string $pageID     Page identifier; encoded and used as subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     // Remove Template as this is already extracted by the Infobox Extractor
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Cut the outer {{ }} and remove the whole template from the source.
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }
     // Extract internal Semantic Links.
     // The property class is A-Z: the former "A-z" range also admitted the
     // characters [ \ ] ^ ` that sit between 'Z' and 'a' in ASCII.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }
     // Extract Literals
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // Layout per the original note: object, object_is, datatype(, language)
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));
         // Skip values that parsed to an empty string.
         if ($lexicalForm == null) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFTriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }
     return $result;
 }
 /**
  * For category pages: emits skos:prefLabel, rdf:type skos:Concept and one
  * skos:broader triple per category link found in the page source.
  *
  * @param string $pageID     Page identifier; must match the localized
  *                           category namespace for anything to be emitted.
  * @param string $pageTitle  Page title; decoded and used as the label.
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);
     if (!preg_match_all("/" . $category . ":(.*)/", $pageID, $match)) {
         return $result;
     }
     $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_PREFLABEL, false), RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language));
     $result->addTriple($this->getPageURI(), RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(SKOS_CONCEPT, false));
     // Non-greedy capture so that several category links on one line are
     // matched individually instead of being merged into one.
     if (preg_match_all("/\\[\\[" . $category . ":(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Keep only the part before the first '|' (the rest is the sort key).
             list($categoryName) = explode('|', $match[1], 2);
             $object = Util::getDBpediaCategoryPrefix($this->language) . URI::wikipediaEncode($categoryName);
             try {
                 $object = RDFtriple::URI($object);
             } catch (Exception $e) {
                 echo 'Caught exception: ', $e->getMessage(), "\n";
                 continue;
             }
             $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_BROADER, false), $object);
         }
     }
     return $result;
 }
 /**
  * Attaches the page's abstract as an "abstract" literal in the extractor
  * language. The abstract is requested from extract_abstract() with the
  * arguments (3000, false, false).
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the single abstract triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Strip wiki markup first, then extract the abstract from the plain text.
     $abstractText = $this->extract_abstract($this->remove_wikicode($pageSource), 3000, false, false);
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::predicate("abstract");
     $object = RDFtriple::Literal($abstractText, NULL, $this->language);
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #5
0
 /**
  * Links the page to its wikicompany.org counterpart via foaf:page.
  * Only active when the extractor language is "en".
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title; encoded into the target URL.
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     if ($this->language != "en") {
         return $result;
     }
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/page");
     $object = RDFtriple::URI("http://wikicompany.org/wiki/" . URI::wikipediaEncode($pageTitle));
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #6
0
 /**
  * Emits an rdfs:label triple carrying the decoded page title.
  * Nothing is emitted when the decoded title compares equal to NULL.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title to decode and use as label.
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->decode_title($pageTitle) != NULL) {
         $subject = $this->getPageURI();
         $predicate = RDFtriple::URI(RDFS_LABEL, false);
         $label = RDFtriple::Literal($this->decode_title($pageTitle), NULL, $this->language);
         $result->addTriple($subject, $predicate, $label);
     }
     return $result;
 }
 /**
  * Attaches the page's abstract (extract_abstract() with its default
  * arguments) as an rdfs:comment literal in the extractor language.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the single comment triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Strip wiki markup first, then extract the abstract from the plain text.
     $shortAbstract = $this->extract_abstract($this->remove_wikicode($pageSource));
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#comment");
     $object = RDFtriple::Literal($shortAbstract, NULL, $this->language);
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #8
0
 /**
  * Emits an owl:sameAs link to the equally named DBpedia resource when the
  * page source contains a {{wikipedia-c}} or {{wikipedia-c-note}} marker.
  *
  * @param string $pageID     Page identifier; encoded before use.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     // Without the marker template there is nothing to link.
     if (!preg_match('/\\{\\{wikipedia\\-c(\\-note)?\\}\\}/', $pageSource)) {
         return $result;
     }
     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::URI("http://www.w3.org/2002/07/owl#sameAs");
     $object = RDFtriple::URI("http://dbpedia.org/resource/" . $pageID);
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #9
0
 /**
  * Builds the schema triples for all collected predicates: each key of
  * $this->predicates is declared an rdf:Property and given an rdfs:label.
  *
  * @return ExtractionResult Result holding two triples per predicate.
  */
 public function getPredicateTriples()
 {
     $predicateTriples = new ExtractionResult($this->pageID, $this->language, $this->extractorID);
     // Only the keys matter; the stored values are mere presence flags.
     foreach (array_keys($this->predicates) as $predicateURI) {
         $predicateTriples->addTriple(
             RDFtriple::URI($predicateURI),
             RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
             RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#Property")
         );
         $predicateTriples->addTriple(
             RDFtriple::URI($predicateURI),
             RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#label"),
             RDFtriple::Literal($this->getPredicateLabel($predicateURI))
         );
     }
     return $predicateTriples;
 }
Example #10
0
 /**
  * Emits a foaf:homepage triple when findLink() yields a link for the page.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page, searched for a link.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $link = $this->findLink($pageSource);
     // A falsy value means no homepage was found.
     if (!$link) {
         return $result;
     }
     $this->log(DEBUG, "Found link {$link}");
     $subject = $this->getPageURI();
     $predicate = RDFtriple::URI(FOAF_HOMEPAGE, false);
     $object = RDFtriple::URI($link, false);
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #11
0
 /**
  * Emits one "wikilink" triple for every internal link returned by
  * extract_internal_links() for the page source.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $targets = $this->extract_internal_links($pageSource, $this->language);
     foreach ($targets as $target) {
         $subject = RDFTriple::page($pageID);
         $predicate = RDFTriple::predicate("wikilink");
         $object = RDFTriple::page($target);
         $result->addTriple($subject, $predicate, $object);
     }
     return $result;
 }
Example #12
0
 /**
  * Emits depiction/thumbnail triples for the page's image plus dc:rights
  * links from both image URLs to the Wikipedia image description page.
  *
  * Nothing is emitted when no image is found or when either generated
  * image URL fails URI::validate().
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or five triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     // extract_image_url() is read as array(0 => image name, 1 => width).
     $imageInfo = $this->extract_image_url($pageSource);
     $image = ucfirst($imageInfo[0]);
     $width = $imageInfo[1];
     if ($image == null) {
         return $result;
     }
     $fullURL = $this->make_image_url($image, false, true);
     $thumbURL = $this->make_image_url($image, $width);
     if (!(URI::validate($fullURL) && URI::validate($thumbURL))) {
         return $result;
     }
     // Page depicts the full-size image.
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(FOAF_DEPICTION), RDFtriple::URI($fullURL));
     // The full-size image has the small one as thumbnail.
     $result->addTriple(RDFtriple::URI($fullURL), RDFtriple::URI(FOAF_THUMBNAIL), RDFtriple::URI($thumbURL));
     // The page itself has the small image as thumbnail.
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(DBO_THUMBNAIL), RDFtriple::URI($thumbURL));
     // Both image URLs point back to the Wikipedia image description page;
     // the file name is trimmed, gets underscores for spaces and is URL-encoded.
     $encodedName = urlencode(str_replace(" ", "_", trim($image)));
     $descriptionURL = 'http://' . $this->language . '.wikipedia.org/wiki/Image:' . $encodedName;
     foreach (array($thumbURL, $fullURL) as $imageURL) {
         $result->addTriple(RDFtriple::URI($imageURL), RDFtriple::URI(DC_RIGHTS), RDFtriple::URI($descriptionURL));
     }
     return $result;
 }
Example #13
0
 /**
  * Links the page's resource to its Wikipedia article in the extractor
  * language.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title; encoded into the article URL.
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result holding the single link triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Language codes in URIs are written with '-', not '_'.
     $language = str_replace('_', '-', $this->language);
     $subject = $this->getPageURI();
     // NOTE(review): English gets the "wikipage-en" property and every other
     // language gets foaf:page - confirm this is not meant the other way round.
     if ($language == 'en') {
         $predicate = RDFtriple::predicate("wikipage-" . $language);
     } else {
         $predicate = RDFtriple::URI(FOAF_PAGE, false);
     }
     $articleURL = "http://" . $language . ".wikipedia.org/wiki/" . URI::wikipediaEncode($pageTitle);
     $object = RDFtriple::URI($articleURL);
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $result->addTriple($subject, $predicate, $object);
     return $result;
 }
Example #14
0
 /**
  * Emits a foaf:depiction triple for the image URL extracted from the page,
  * provided one is found and it passes URI::validate().
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title; forwarded to extract_image_url().
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $depictionURL = $this->extract_image_url($pageSource, $pageTitle);
     // Only a present and valid URL yields a triple.
     if ($depictionURL != null && URI::validate($depictionURL)) {
         $subject = RDFtriple::page($pageID);
         $predicate = RDFtriple::URI("http://xmlns.com/foaf/0.1/depiction");
         $result->addTriple($subject, $predicate, RDFtriple::URI($depictionURL));
     }
     return $result;
 }
 /**
  * Emits one "reference" triple for every valid external link of the page.
  *
  * The URLs are the keys of the array returned by extract_external_links();
  * the associated values are not used here.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per valid link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach replaces the each()/list() idiom: each() is deprecated as of
     // PHP 7.2 and no longer exists in PHP 8.
     foreach ($extlinks as $ExtURL => $ExtName) {
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("reference"), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
 /**
  * Emits owl:sameAs links from this page's language-specific DBpedia
  * resource to the corresponding resources in other languages, based on
  * the `langlinks` table.
  *
  * Rows in page namespace 14 are excluded by the query.
  *
  * @param string $pageID     Page identifier; matched against page_title.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result holding one triple per language link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $org_language = $this->language;
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     // Escape with the connection-aware function so the connection's
     // character set is taken into account.
     $escapedTitle = mysql_real_escape_string($pageID, $this->DBlink);
     $query = "select page_title, page_namespace, ll.ll_lang,replace(trim(ll_title), ' ', '_')as lang_title from page p inner join langlinks ll on p.page_id = ll.ll_from where p.page_title= '" . $escapedTitle . "' and p.page_namespace <> 14";
     $queryresult = mysql_query($query, $this->DBlink) or die(" search unsuccessful: " . mysql_error());
     while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
         $localResource = "http://" . $org_language . ".dbpedia.org/resource/" . URI::wikipediaEncode($pageID);
         $foreignResource = "http://" . $row["ll_lang"] . ".dbpedia.org/resource/" . URI::wikipediaEncode($row["lang_title"]);
         $result->addTriple(RDFtriple::URI($localResource), RDFtriple::URI(OWL_SAMEAS, false), RDFtriple::URI($foreignResource));
     }
     return $result;
 }
Example #17
0
 /**
  * Emits "wordnet_type" triples for the page by joining its template links
  * against the wordnet mapping tables. Only active for language "en".
  *
  * @param string $pageID     Page identifier; matched against page_title.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result holding one triple per mapped synset URL.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if ($this->language != "en") {
         return $result;
     }
     // Escape with the connection-aware function so the connection's
     // character set is taken into account.
     $escapedTitle = mysql_real_escape_string($pageID, $this->DBlink);
     $query = "select wsl.url from templatelinks tl inner join page p on p.page_id = tl.tl_from\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_mapping wm on tl.tl_title = wm.infobox\r\n\t\t\t\t\t\tinner join dbpedia_develop.wordnet_synsets_links wsl on wm.ID1 = wsl.synset30ID\r\n\t\t\t\t\t\twhere p.page_title = '" . $escapedTitle . "' and p.page_namespace = 0";
     $queryresult = mysql_query($query, $this->DBlink) or die("Query failed:\n{$query}\n" . mysql_error());
     while ($row = mysql_fetch_array($queryresult, MYSQL_ASSOC)) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("wordnet_type"), RDFtriple::URI($row["url"]));
     }
     return $result;
 }
Example #18
0
 /**
  * Emits geo:lat and geo:long float literals taken from the first
  * <geo>lat;long...</geo> tag found in the page source.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or two triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     // Locate geo coordinates; bail out when the tag is absent.
     if (!preg_match('/<geo>([\\-0-9\\.]+);([\\-0-9\\.]+)[^0-9]*[^<]*<\\/geo>/', $pageSource, $coords)) {
         return $result;
     }
     $floatType = "http://www.w3.org/2001/XMLSchema#float";
     $latitude = $coords[1];
     $longitude = $coords[2];
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#lat"), RDFtriple::Literal($latitude, $floatType, NULL));
     $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2003/01/geo/wgs84_pos#long"), RDFtriple::Literal($longitude, $floatType, NULL));
     return $result;
 }
 /**
  * For non-category pages: emits one skos:subject triple per
  * [[Category:...]] link found in the page source.
  *
  * @param string $pageID     Page identifier; pages containing "Category:"
  *                           are skipped.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per category link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     if (preg_match("/Category:/", $pageID)) {
         return $result;
     }
     // Non-greedy capture so that several category links on one line are
     // matched individually instead of being merged into one.
     if (preg_match_all("/\\[\\[Category:(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Drop the sort key that follows a pipe.
             $Category = preg_replace("/\\|.*/", "", $match[1]);
             $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://www.w3.org/2004/02/skos/core#subject"), RDFtriple::page("Category:" . $Category));
         }
     }
     return $result;
 }
 /**
  * For category pages: emits skos:prefLabel, rdf:type skos:Concept and one
  * skos:broader triple per [[Category:...]] link in the page source.
  *
  * @param string $pageID     Page identifier; must contain "Category:" for
  *                           anything to be emitted.
  * @param string $pageTitle  Page title; decoded and used as the label.
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     if (!preg_match_all("/Category:(.*)/", $pageID, $match)) {
         return $result;
     }
     $result->addTriple(RDFTriple::page($pageID), RDFTriple::URI("http://www.w3.org/2004/02/skos/core#prefLabel"), RDFTriple::Literal($this->decode_title($pageTitle), NULL, $this->language));
     $result->addTriple(RDFTriple::page($pageID), RDFTriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFTriple::URI("http://www.w3.org/2004/02/skos/core#Concept"));
     // Non-greedy capture so that several category links on one line are
     // matched individually instead of being merged into one.
     if (preg_match_all("/\\[\\[Category:(.*?)\\]\\]/", $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Keep only the category name; anything after '|' is a sort key
             // and must not become part of the broader-category resource.
             list($categoryName) = explode('|', $match[1], 2);
             $result->addTriple(RDFTriple::page($pageID), RDFTriple::URI("http://www.w3.org/2004/02/skos/core#broader"), RDFTriple::page("Category:" . $categoryName));
         }
     }
     return $result;
 }
Example #21
0
 /**
  * Sample extractor: when the page source contains "{{chembox header}}",
  * emits one placeholder property triple and registers the predicate.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // Container for every triple extracted from this page.
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     // Pages without the chembox header are of no interest.
     if (!preg_match("/{{chembox header}}/", $pageSource)) {
         return $result;
     }
     // Actual parsing would go here; one triple is added per property.
     $subject = $this->getPageURI();
     $predicate = RDFtriple::URI(DB_MY_CHEM_PROPERTY, false);
     $value = RDFtriple::Literal("my_value");
     $result->addTriple($subject, $predicate, $value);
     // Every used predicate is also recorded in the predicate collection.
     $this->allPredicates->addPredicate("my_chem_property");
     return $result;
 }
 /**
  * Emits one DB_REFERENCE triple for every valid external link of the page.
  *
  * The URLs are the keys of the array returned by extract_external_links();
  * the associated values are not used here.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per valid link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach replaces the each()/list() idiom: each() is deprecated as of
     // PHP 7.2 and no longer exists in PHP 8.
     foreach ($extlinks as $ExtURL => $ExtName) {
         // Replace single quotes with %27 and double every backslash.
         $ExtURL = str_replace("'", "%27", $ExtURL);
         $ExtURL = str_replace("\\", "\\\\", $ExtURL);
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_REFERENCE, false), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
Example #23
0
 /**
  * Emits one DB_WIKILINK triple per distinct internal link of the page.
  * Links whose resource URI is rejected by RDFtriple::URI() are logged and
  * skipped.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per usable link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $pagelinks = $this->extract_internal_links($pageSource, $this->language);
     $pagelinks = array_unique($pagelinks);
     foreach ($pagelinks as $LinkURI) {
         $object = DB_RESOURCE_NS . ucfirst(URI::wikipediaEncode($LinkURI));
         try {
             $object = RDFtriple::URI($object);
         } catch (Exception $e) {
             // Build ONE message string: passing the parts as separate
             // arguments dropped the exception text from the log entry.
             $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_WIKILINK, false), $object);
     }
     return $result;
 }
Example #24
0
 /**
  * Runs the global parsePage() routine on the page and converts every entry
  * it leaves in the global $parseResult into an RDF triple.
  *
  * Each entry is read as: [0] subject URI, [1] predicate URI, [2] object,
  * [3] object kind ("r" means resource), [4] datatype, [5] language.
  * A single trailing digit on a predicate (LeaderName1, LeaderName2, ...)
  * is stripped so that numbered variants share one predicate.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title; published via the global $pagetitle.
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per parsed entry.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     global $pagetitle;
     // Needed for Imageextraction in catchObjectDatatype.php (catchLogo());
     $pagetitle = $pageTitle;
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     global $parseResult;
     // Contains the Extraction result
     $parseResult = null;
     parsePage($pageID, $pageSource);
     // Guard the null case explicitly: count(null) raises a TypeError on PHP 8.
     if ($parseResult === null || count($parseResult) < 1) {
         return $result;
     }
     foreach ($parseResult as $myTriple) {
         $subject = RDFtriple::URI($myTriple[0]);
         // Rename Properties like LeaderName1, LeaderName2, ... to LeaderName
         if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
             $myTriple[1] = $matches[1];
         }
         $predicate = RDFtriple::URI($myTriple[1]);
         if ($myTriple[3] == "r") {
             $object = RDFtriple::URI($myTriple[2]);
         } else {
             // Literals without an explicit language get the extractor language.
             if (!isset($myTriple[5]) || $myTriple[5] == null) {
                 $myTriple[5] = $this->language;
             }
             $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
         }
         $result->addTriple($subject, $predicate, $object);
         $this->allPredicates->addPredicate($myTriple[1]);
     }
     return $result;
 }
Example #25
0
 /**
  * Looks for the page subject's homepage in two places - known infobox
  * properties and designated entries of the "external links" section - and
  * emits a foaf:homepage triple for the first valid candidate.
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $foundLinks = array();
     /* Look in infoboxes */
     $infoboxes = $this->getInfoboxes($pageSource);
     foreach ($infoboxes[1] as $box) {
         $boxProperties = $this->getBoxProperties($box, true);
         foreach ($this->knownHomepagePredicates as $pred) {
             $key = strtolower(urldecode($pred));
             if (isset($boxProperties[$key])) {
                 $foundLinks[] = $this->parseURL($boxProperties[$key], true);
                 if (HomepageExtractor::enableDebug) {
                     echo "<h3>Found box property '" . $pred . "'</h3>";
                 }
             }
         }
     }
     /* Process "External links" */
     if (isset($this->externalLinkSections[$this->language])) {
         $sectionPattern = '/(==+\\s*' . $this->externalLinkSections[$this->language] . '\\s*==+(?:.(?!==+[^=]+==+))*)/s';
         // Only scan for list items when the section actually exists;
         // otherwise $matches[1] is undefined.
         if (preg_match($sectionPattern, $pageSource, $matches)) {
             preg_match_all('/\\*\\s*([^\\n]*)/', $matches[1], $links);
             $linkDesignationsPattern = '/\\b(' . implode('|', $this->knownLinkDesignations) . ')\\b/i';
             foreach ($links[1] as $link) {
                 if (preg_match($linkDesignationsPattern, $link)) {
                     $foundLinks[] = $this->parseURL($link, true);
                 }
             }
         }
     }
     $numResults = 0;
     foreach ($foundLinks as $link) {
         if (URI::validate($link)) {
             if (HomepageExtractor::enableDebug) {
                 echo "<h3>Found link {$link}</h3>";
             }
             /* Only process the first result */
             if (++$numResults == 1) {
                 $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI("http://xmlns.com/foaf/0.1/homepage"), RDFtriple::URI($link));
             }
         }
     }
     return $result;
 }
 /**
  * Looks up the page's database ID, lets extractClasses() compute class
  * information for it, and turns every returned entry with a non-empty
  * 'object' into a triple: an rdf:type link when the entry has no
  * datatype, otherwise a typed literal under the entry's own predicate.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title; matched against page_title.
  * @param string $pageSource Wiki markup of the page (not used).
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $res = mysql_query("SELECT page_id from page WHERE page_title='" . mysql_real_escape_string($pageTitle, $this->link) . "' and page_namespace=0 and page_is_redirect=0", $this->link);
     // A failed query or an unknown title yields no row; pass null as the ID
     // instead of indexing into a non-array.
     $row = $res ? mysql_fetch_array($res) : false;
     $realID = $row ? $row['page_id'] : null;
     $tempExtractionResult = $this->extractClasses($pageTitle, $realID);
     $entryCount = count($tempExtractionResult);
     for ($i = 0; $i < $entryCount; $i++) {
         if (!isset($tempExtractionResult[$i]['object']) || strlen($tempExtractionResult[$i]['object']) == 0) {
             continue;
         }
         $entry = $tempExtractionResult[$i];
         $subject = RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['subject']));
         if (!isset($entry['datatype']) || strlen($entry['datatype']) == 0) {
             // No datatype: the object is a class resource.
             $result->addTriple($subject, RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['object'])));
         } else {
             $result->addTriple($subject, RDFtriple::URI($entry['predicate']), RDFtriple::literal($entry['object'], $entry['datatype']));
         }
     }
     return $result;
 }
Example #27
0
 /**
  * For redirect pages: emits a DB_REDIRECT triple pointing at the target of
  * the first [[...]] link in the page source.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with zero or one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (!Util::isRedirect($pageSource, $this->language)) {
         return $result;
     }
     if (preg_match("/\\[\\[(.*?)\\]\\]/", $pageSource, $matches) === 1) {
         $o = null;
         try {
             $s = $this->getPageURI();
             $p = RDFtriple::URI(DB_REDIRECT, false);
             $o = RDFtriple::page($this->getLinkForLabeledLink($matches[1]));
             $result->addTriple($s, $p, $o);
         } catch (Exception $e) {
             // exception is thrown when URIs are not valid, in this case we just
             // do nothing i.e. do not write the triple.
             // The object may not exist yet when its construction is what
             // failed, so fall back to the raw link text for the log entry.
             $invalid = $o !== null ? $o->getURI() : $matches[1];
             $this->log(INFO, $invalid . ' is an invalid uri');
         }
     }
     return $result;
 }
 /**
  * For disambiguation pages: emits a DB_DISAMBIGUATES triple for every
  * namespace-free internal link whose text contains the page's own name
  * (with the disambiguation suffix removed).
  *
  * @param string $pageID     Page identifier; used as the triple subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per matching link.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     global $MEDIAWIKI_DISAMBIGUATIONS_EXTENSION;
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (Util::isDisambiguation($pageSource, $this->language)) {
         // use only links that include the name of the current page and don't include a namespace.
         // Example: http://en.wikipedia.org/wiki/User
         // - we omit [[Wikipedia:Username policy]]
         // - we include [[User (computing)]] and many others
         // - TODO: we should include [[Consumer]], but don't - it doesn't include "user"
         if (isset($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language])) {
             foreach ($MEDIAWIKI_DISAMBIGUATIONS_EXTENSION[$this->language] as $disambig) {
                 // Compare against false: a hit at offset 0 is still a hit.
                 if (strpos($pageID, $disambig) !== false) {
                     $pageIDClean = str_replace('_(' . $disambig . ')', '', $pageID);
                 }
             }
         } else {
             $pageIDClean = str_replace('_(disambiguation)', '', $pageID);
         }
         if (!isset($pageIDClean)) {
             $pageIDClean = "";
             $warn = "pageidclean not set";
         }
         // Quote for the '/' delimiter as well, otherwise a page name that
         // contains a slash terminates the pattern early.
         $regex = '/\\[\\[([^:\\[\\]]*?' . preg_quote($pageIDClean, '/') . '[^\\[\\]]*?)\\]\\]/i';
         if (preg_match_all($regex, $pageSource, $matches, PREG_SET_ORDER)) {
             foreach ($matches as $match) {
                 $object = DB_RESOURCE_NS . URI::wikipediaEncode($this->getLinkForLabeledLink($match[1]));
                 try {
                     $object = RDFtriple::URI($object);
                 } catch (Exception $e) {
                     $this->log('warn', 'Caught exception: ' . $e->getMessage() . "\n");
                     continue;
                 }
                 $result->addTriple(RDFtriple::page($pageID), RDFtriple::URI(DB_DISAMBIGUATES, false), $object);
             }
         }
     }
     if (isset($warn)) {
         $this->log('warn', $warn . " {$pageID} \n");
     }
     return $result;
 }
 /**
  * Derives two texts from the first 8192 bytes of the page source and emits
  * them as literals: the first two sentences (DBCOMM_COMMENT) and everything
  * before the first "==" (DBCOMM_ABSTRACT). Empty texts are not emitted.
  *
  * @param string $pageID     Page identifier.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result with up to two triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     //TODO image namespace
     //TODO not sure what to take as magic number here:
     // 4096 was too short, e.g. inappropriate for london
     $text = self::stripMarkup(substr($pageSource, 0, 8192));
     //TODO REMOVE THIS LINE FOR DEBUGGING:
     $text = $this->_exceptions($text);
     // 2 is probably perfect, since it guarantees a certain length
     $firstTwoSentences = $this->_extractStart($text, 2);
     // Heuristic to tidy the abstract: take everything up to the first '=='.
     // NOTE(review): a '==' at offset 0 yields an empty abstract - confirm
     // that this is intended.
     $headingPos = strpos($text, '==');
     $fullabstract = trim($headingPos !== false ? substr($text, 0, $headingPos) : $text);
     if (!empty($firstTwoSentences)) {
         $subject = $this->getPageURI();
         $predicate = RDFtriple::URI(DBCOMM_COMMENT, false);
         $literal = RDFtriple::Literal($firstTwoSentences, NULL, $this->language);
         $this->log('debug', 'Found: ' . $subject->toString() . " " . $predicate->toString() . " " . $literal->toString());
         $result->addTriple($subject, $predicate, $literal);
     }
     if (!empty($fullabstract)) {
         $subject = $this->getPageURI();
         $predicate = RDFtriple::URI(DBCOMM_ABSTRACT, false);
         $literal = RDFtriple::Literal($fullabstract, NULL, $this->language);
         $this->log('debug', 'Found: ' . $subject->toString() . " " . $predicate->toString() . " " . $literal->toString());
         $result->addTriple($subject, $predicate, $literal);
     }
     //TODO $clipped = substr( $extract, 0, 1024 );
     //TODO UtfNormal::cleanUp( $clipped ); in include/normal/UtfNormal
     return $result;
 }
 /**
  * For non-category pages: emits one SKOS_SUBJECT triple per category link
  * (in the localized category namespace) found in the page source.
  * Categories whose URI is rejected by RDFtriple::URI() are logged and
  * skipped.
  *
  * @param string $pageID     Page identifier; category pages are skipped.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Wiki markup of the page.
  * @return ExtractionResult Result holding one triple per usable category.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $category = Util::getMediaWikiNamespace($this->language, MW_CATEGORY_NAMESPACE);
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     if (preg_match("/" . $category . ":/", $pageID)) {
         return $result;
     }
     // The U modifier makes the capture ungreedy, so each link is matched
     // on its own.
     if (preg_match_all("/\\[\\[" . $category . ":(.*)\\]\\]/U", $pageSource, $matches, PREG_SET_ORDER)) {
         foreach ($matches as $match) {
             // Drop the sort key that follows a pipe.
             $Category = preg_replace("/\\|.*/", "", $match[1]);
             $object = Util::getDBpediaCategoryPrefix($this->language) . URI::wikipediaEncode($Category);
             try {
                 $object = RDFtriple::URI($object);
             } catch (Exception $e) {
                 // Build ONE message string: passing the parts as separate
                 // arguments dropped the exception text from the log entry.
                 $this->log(WARN, 'Caught exception: ' . $e->getMessage() . "\n");
                 continue;
             }
             $result->addTriple($this->getPageURI(), RDFtriple::URI(SKOS_SUBJECT, false), $object);
         }
     }
     return $result;
 }