Beispiel #1
0
 /**
  * Extracts semantic link and literal annotations from a page's wiki source.
  *
  * Brace-delimited template blocks are stripped from the source first. After
  * that, two annotation forms are collected:
  *   - [[property::target]] becomes a triple whose object is a page resource
  *   - [[property:=value]]  becomes a triple whose object is a literal
  *
  * @param mixed  $pageID     Page identifier; passed through encodeLocalName() before use as subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Raw wiki source of the page.
  *
  * @return ExtractionResult Result holding every triple found on the page.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);

     // Templates are already extracted by the Infobox Extractor, so every
     // (possibly nested) brace-delimited block is removed from the source.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Strip the outer {{ }} so the block can be re-wrapped and removed verbatim.
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }

     // Internal semantic links: [[property::target]].
     // The character class uses A-Z; the former "A-z" range also matched
     // the punctuation characters [ \ ] ^ and ` that sit between 'Z' and 'a'.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(
             RDFtriple::page($pageID),
             RDFtriple::predicate(encodeLocalName($match[2])),
             RDFtriple::page($match[4])
         );
     }

     // Literals: [[property:=value]].
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // parseAttributeValue() yields: object, object_is, datatype(, language)
         $parsed = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $parsed[0];
         $datatype = $parsed[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));

         // Skip annotations whose value parsed to nothing.
         if ($lexicalForm == null) {
             continue;
         }

         // NOTE(review): the literal language is hard-coded to 'en' although the
         // result itself is created with $this->language - confirm this is intended.
         $result->addTriple(
             RDFtriple::page($pageID),
             RDFtriple::predicate($predicate),
             RDFtriple::literal($lexicalForm, $datatype, 'en')
         );
     }

     return $result;
 }
Beispiel #2
0
 /**
  * Converts the output of the global parsePage() function into RDF triples.
  *
  * parsePage() reports its findings through the global $parseResult. Each
  * entry is read positionally: [0] subject URI, [1] predicate URI, [2] object
  * value, [3] object kind ("r" marks a resource), [4] datatype, [5] language.
  * Predicates that end in a single digit (LeaderName1, LeaderName2, ...) are
  * collapsed onto their common stem (LeaderName).
  *
  * @param mixed  $pageID     Page identifier handed to parsePage() and the result.
  * @param string $pageTitle  Page title; published through the global $pagetitle.
  * @param string $pageSource Raw wiki source of the page.
  *
  * @return ExtractionResult Result holding one triple per parsed entry.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     global $pagetitle;
     // Needed for image extraction in catchObjectDatatype.php (catchLogo()).
     $pagetitle = $pageTitle;

     $result = new ExtractionResult($pageID, $this->language, self::extractorID);

     global $parseResult;
     // Filled in by parsePage() below.
     $parseResult = null;
     parsePage($pageID, $pageSource);

     // empty() also covers the case where $parseResult is still null, which
     // count() would warn about (or reject outright on newer PHP versions).
     if (empty($parseResult)) {
         return $result;
     }

     $knownProperties = array($parseResult[0][1]);
     foreach ($parseResult as $myTriple) {
         $subject = RDFtriple::URI($myTriple[0]);

         // Rename properties like LeaderName1, LeaderName2, ... to LeaderName.
         if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
             $myTriple[1] = $matches[1];
         }

         // Strict membership test: array_search() returns index 0 for the first
         // known property, which a plain truthiness check treats as "not found"
         // and therefore kept appending duplicates.
         if (!in_array($myTriple[1], $knownProperties, true)) {
             $knownProperties[] = $myTriple[1];
         }

         $predicate = RDFtriple::URI($myTriple[1]);

         if ($myTriple[3] == "r") {
             $object = RDFtriple::URI($myTriple[2]);
         } else {
             // Fall back to the extractor's language when the entry carries none.
             if (!isset($myTriple[5]) || $myTriple[5] == null) {
                 $myTriple[5] = $this->language;
             }
             $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
         }

         $result->addTriple($subject, $predicate, $object);
         $this->allPredicates->addPredicate($myTriple[1]);
     }

     return $result;
 }
 /**
  * Emits triples for the entries returned by extractClasses() for a page.
  *
  * The page's database id is looked up by title (namespace 0, non-redirect)
  * and handed to extractClasses(). Entries without a datatype become rdf:type
  * triples between two dbpedia.org resources; entries with a datatype become
  * literal triples using the entry's own predicate.
  *
  * @param mixed  $pageID     Page identifier stored in the result.
  * @param string $pageTitle  Page title used for the database lookup.
  * @param string $pageSource Raw wiki source (not used by this extractor).
  *
  * @return ExtractionResult Result holding the generated triples.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     $res = mysql_query("SELECT page_id from page WHERE page_title='" . mysql_real_escape_string($pageTitle, $this->link) . "' and page_namespace=0 and page_is_redirect=0", $this->link);

     // A failed query or an empty result set leaves the id at null, which is
     // what the unchecked lookup effectively produced before - only without
     // feeding a non-resource into mysql_fetch_array().
     $realID = null;
     if ($res !== false) {
         $row = mysql_fetch_array($res);
         if ($row !== false) {
             $realID = $row['page_id'];
         }
     }

     $tempExtractionResult = $this->extractClasses($pageTitle, $realID);

     // The list is not modified inside the loop, so its size is taken once.
     $total = count($tempExtractionResult);
     for ($i = 0; $i < $total; $i++) {
         // Entries without a non-empty object carry nothing to emit.
         if (!isset($tempExtractionResult[$i]['object']) || strlen($tempExtractionResult[$i]['object']) == 0) {
             continue;
         }
         $entry = $tempExtractionResult[$i];
         $subject = RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['subject']));

         if (!isset($entry['datatype']) || strlen($entry['datatype']) == 0) {
             // No datatype: the object is treated as a resource and linked via rdf:type.
             $result->addTriple(
                 $subject,
                 RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                 RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['object']))
             );
         } else {
             // Datatype present: emit a typed literal under the entry's predicate.
             $result->addTriple(
                 $subject,
                 RDFtriple::URI($entry['predicate']),
                 RDFtriple::literal($entry['object'], $entry['datatype'])
             );
         }
     }

     return $result;
 }
 /**
  * Records the length of the page's wiki source as a single triple.
  *
  * The subject comes from getPageURI(), the predicate is the
  * DB_CHARACTERCOUNT constant and the object is the strlen() of the source,
  * emitted as a string literal typed XS_INTEGER.
  *
  * @param mixed  $pageID     Page identifier stored in the result.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Raw wiki source whose length is reported.
  *
  * @return ExtractionResult Result holding exactly one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     $subject = $this->getPageURI();
     $predicate = RDFtriple::URI(DB_CHARACTERCOUNT, false);
     // NOTE(review): strlen() counts bytes, not characters - confirm that is
     // the intended measure for multi-byte page sources.
     $object = RDFtriple::literal((string) strlen($pageSource), XS_INTEGER);

     $result->addTriple($subject, $predicate, $object);

     return $result;
 }
 /**
  * Records the length of the page's wiki source as a single triple.
  *
  * The subject is the page resource for $pageID, the predicate is
  * "characterCount" and the object is a literal built from the strlen()
  * of the source.
  *
  * @param mixed  $pageID     Page identifier used for the result and the subject.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Raw wiki source whose length is reported.
  *
  * @return ExtractionResult Result holding exactly one triple.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);

     $subject = RDFtriple::page($pageID);
     $predicate = RDFtriple::predicate("characterCount");
     // NOTE(review): strlen() counts bytes, not characters - confirm that is
     // the intended measure for multi-byte page sources.
     $object = RDFtriple::literal(strlen($pageSource));

     $result->addTriple($subject, $predicate, $object);

     return $result;
 }
Beispiel #6
0
 /**
  * Converts the output of $this->parsePage() into RDF triple objects.
  *
  * parsePage() reports its findings through the global $parseResult. Each
  * entry is read positionally: [0] subject URI, [1] predicate URI, [2] object
  * value, [3] object kind ("r" marks a resource), [4] datatype, [5] language.
  * Entries whose subject, predicate or resource object cannot be turned into
  * a URI are reported on standard output and skipped, as are entries whose
  * predicate exceeds the configured maximum property length.
  *
  * @param mixed  $pageID     Page identifier handed to parsePage() and the result.
  * @param string $pageTitle  Page title (not used by this extractor).
  * @param string $pageSource Raw wiki source of the page.
  *
  * @return ExtractionResult Result holding one triple object per accepted entry.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     global $parseResult;
     // Filled in by parsePage() below.
     $parseResult = null;
     $this->parsePage($pageID, $pageSource, $this->language);

     // empty() also covers the case where $parseResult is still null, which
     // count() would warn about (or reject outright on newer PHP versions).
     if (empty($parseResult)) {
         return $result;
     }

     $knownProperties = array($parseResult[0][1]);
     foreach ($parseResult as $myTriple) {
         try {
             $subject = RDFtriple::URI($myTriple[0]);
         } catch (Exception $e) {
             echo 'Caught exception: ', $e->getMessage(), "\n";
             continue;
         }

         // Rename properties like LeaderName1, LeaderName2, ... to LeaderName.
         if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
             // Properties written in a non-Latin script (e.g. Korean) end in an
             // encoded byte such as _percent_B1 - or %B1 when
             // language.use_percent_encoding = false - and must not be shortened.
             // preg_match() replaces ereg(), which was removed in PHP 7.
             $endsWithPercentWord = substr(substr($myTriple[1], -11), 0, 9) == "_percent_";
             $endsWithPercentByte = preg_match('/%([A-F0-9]{2})/', substr($myTriple[1], -3));
             if (!$endsWithPercentWord && !$endsWithPercentByte) {
                 $myTriple[1] = $matches[1];
                 // Strict membership test: array_search() returns index 0 for the
                 // first known property, which a truthiness check reads as "not
                 // found" and therefore kept appending duplicates.
                 if (!in_array($matches[1], $knownProperties, true)) {
                     $knownProperties[] = $matches[1];
                 }
             }
         } elseif (!in_array($myTriple[1], $knownProperties, true)) {
             $knownProperties[] = $myTriple[1];
         }

         // A property longer than the configured maximum length is not written.
         if (strlen($myTriple[1]) > $GLOBALS['W2RCFG']['maximumPropertyLength']) {
             continue;
         }

         try {
             $predicate = RDFtriple::URI($myTriple[1]);
         } catch (Exception $e) {
             echo 'Caught exception: ', $e->getMessage(), "\n";
             continue;
         }

         if ($myTriple[3] == "r") {
             try {
                 $object = RDFtriple::URI($myTriple[2]);
             } catch (Exception $e) {
                 echo 'Caught exception: ', $e->getMessage(), "\n";
                 continue;
             }
         } else {
             // Fall back to the extractor's language when the entry carries none.
             if (!isset($myTriple[5]) || $myTriple[5] == null) {
                 $myTriple[5] = $this->language;
             }
             $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
         }

         // Subtemplate problem (e.g. db:London/rating): when the triple's
         // subject URI merely extends the current page's URI, annotate the
         // triple with an on-delete-cascade reference to the page.
         $triple = new RDFtriple($subject, $predicate, $object);
         $currentSubject = RDFtriple::page($pageID);
         $small = $currentSubject->getURI();
         $big = $subject->getURI();
         if (strpos($big, $small) === 0 && strlen($big) > strlen($small)) {
             $triple->addOnDeleteCascadeAnnotation($currentSubject);
         }

         $result->addTripleObject($triple);
         $this->allPredicates->addPredicate($myTriple[1]);
     }

     return $result;
 }