/**
 * Extracts semantic annotations of the form [[property::target]] and
 * [[property:=value]] from a page's source.
 *
 * Curly-brace template blocks are removed from the source first, because
 * (per the original author's note) templates are already handled by the
 * Infobox Extractor.
 *
 * @param mixed  $pageID     Page identifier; handed to ExtractionResult as-is and
 *                           then encoded with encodeLocalName() for use as subject.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page markup to scan.
 * @return ExtractionResult  Result holding one triple per recognised annotation.
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $pageID = encodeLocalName($pageID);

    // Remove templates, as these are already extracted by the Infobox Extractor.
    // The recursive pattern matches balanced {...} groups; each match has its
    // outer {{ }} cut off and the re-wrapped {{...}} text is deleted from the source.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        $inner = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate); // Cut brackets {{ }}
        $pageSource = str_replace('{{' . $inner . '}}', '', $pageSource);
    }

    // Extract internal semantic links: [[property::target]].
    // The property character class uses A-Z; the former "A-z" range also
    // admitted the punctuation between 'Z' and 'a' ([ \ ] ^ `).
    preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $linkMatches, PREG_SET_ORDER);
    foreach ($linkMatches as $match) {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate(encodeLocalName($match[2])),
            RDFtriple::page($match[4])
        );
    }

    // Extract literals: [[property:=value]].
    preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $literalMatches, PREG_SET_ORDER);
    foreach ($literalMatches as $match) {
        // parseAttributeValue() yields: object, object_is, datatype(, language)
        $parsed = parseAttributeValue($match[4], $pageID, $match[2]);
        $lexicalForm = $parsed[0];
        $datatype = $parsed[2];
        $predicate = propertyToCamelCase(encodeLocalName($match[2]));

        // Skip annotations whose value parsed to nothing.
        if ($lexicalForm == null) {
            continue;
        }

        $result->addTriple(
            RDFtriple::page($pageID),
            RDFTriple::predicate($predicate),
            RDFtriple::literal($lexicalForm, $datatype, 'en')
        );
    }

    return $result;
}
/**
 * Runs the global parsePage() routine on a page and converts the rows it
 * leaves in the global $parseResult into RDF triples.
 *
 * Each row is read positionally: [0] subject URI, [1] predicate URI,
 * [2] object value, [3] "r" when the object is a resource, [4] literal
 * datatype, [5] literal language (falls back to $this->language when empty).
 *
 * @param mixed  $pageID     Page identifier passed to ExtractionResult and parsePage().
 * @param mixed  $pageTitle  Published through the global $pagetitle for the parser helpers.
 * @param string $pageSource Page markup handed to parsePage().
 * @return ExtractionResult  Empty when parsing produced no rows.
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    global $pagetitle; // Needed for image extraction in catchObjectDatatype.php (catchLogo())
    $pagetitle = $pageTitle;

    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    global $parseResult; // Contains the extraction result
    $parseResult = null;
    parsePage($pageID, $pageSource);

    // $parseResult stays null when the parser produced nothing; test for that
    // explicitly because count(null) is an error on current PHP versions.
    if ($parseResult === null || count($parseResult) < 1) {
        return $result;
    }

    // Distinct property names seen so far on this page.
    $knownProperties = array($parseResult[0][1]);

    foreach ($parseResult as $myTriple) {
        $subject = RDFtriple::URI($myTriple[0]);

        // Rename properties like LeaderName1, LeaderName2, ... to LeaderName
        // (only a single trailing digit is stripped).
        if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
            $myTriple[1] = $matches[1];
        }

        // Strict membership test: the previous array_search() truthiness check
        // treated a hit at index 0 as "not found" and kept appending duplicates.
        if (!in_array($myTriple[1], $knownProperties, true)) {
            $knownProperties[] = $myTriple[1];
        }

        $predicate = RDFtriple::URI($myTriple[1]);

        if ($myTriple[3] == "r") {
            $object = RDFtriple::URI($myTriple[2]);
        } else {
            if ($myTriple[5] == null) {
                $myTriple[5] = $this->language;
            }
            $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
        }

        $result->addTriple($subject, $predicate, $object);
        $this->allPredicates->addPredicate($myTriple[1]);
    }

    return $result;
}
/**
 * Looks up the page's id in the `page` table and turns the entries returned
 * by extractClasses() into triples.
 *
 * Entries without a non-empty 'object' are skipped. An entry without a
 * datatype becomes an rdf:type triple between two dbpedia.org resources;
 * an entry with a datatype becomes a literal triple using the entry's own
 * predicate.
 *
 * @param mixed  $pageID     Page identifier passed to ExtractionResult.
 * @param string $pageTitle  Title used for the database lookup and for extractClasses().
 * @param mixed  $pageSource Not used by this method.
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $res = mysql_query("SELECT page_id from page WHERE page_title='" . mysql_real_escape_string($pageTitle, $this->link) . "' and page_namespace=0 and page_is_redirect=0", $this->link);

    // A failed query or an unknown title leaves the id at null, which is the
    // value the unchecked lookup effectively produced before; checking first
    // avoids calling mysql_fetch_array() on a non-resource.
    $realID = null;
    if ($res !== false) {
        $row = mysql_fetch_array($res);
        if (is_array($row)) {
            $realID = $row['page_id'];
        }
    }

    $tempExtractionResult = $this->extractClasses($pageTitle, $realID);

    $entryCount = count($tempExtractionResult);
    for ($i = 0; $i < $entryCount; $i++) {
        if (!isset($tempExtractionResult[$i]['object']) || strlen($tempExtractionResult[$i]['object']) == 0) {
            continue;
        }
        $entry = $tempExtractionResult[$i];
        $subject = RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['subject']));

        if (!isset($entry['datatype']) || strlen($entry['datatype']) == 0) {
            // No datatype: the object is a resource, emitted as an rdf:type statement.
            $result->addTriple(
                $subject,
                RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                RDFtriple::URI('http://dbpedia.org/resource/' . urlencode($entry['object']))
            );
        } else {
            $result->addTriple(
                $subject,
                RDFtriple::URI($entry['predicate']),
                RDFtriple::literal($entry['object'], $entry['datatype'])
            );
        }
    }

    return $result;
}
/**
 * Emits a single triple recording the length of the page source.
 *
 * The length comes from strlen(), which measures bytes, and is written as
 * a string literal typed XS_INTEGER under the DB_CHARACTERCOUNT predicate.
 *
 * @param mixed  $pageID     Page identifier passed to ExtractionResult.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page markup whose length is reported.
 * @return ExtractionResult  Result containing exactly one triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $subject = $this->getPageURI();
    $predicate = RDFtriple::URI(DB_CHARACTERCOUNT, false);
    $object = RDFtriple::literal((string) strlen($pageSource), XS_INTEGER);

    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Emits a single "characterCount" triple for the page.
 *
 * The value is the integer returned by strlen(), i.e. the byte length of
 * the page source, passed to RDFtriple::literal() without a datatype.
 *
 * @param mixed  $pageID     Page identifier; used for the result and as the subject page.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page markup whose length is reported.
 * @return ExtractionResult  Result containing exactly one triple.
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $subject = RDFtriple::page($pageID);
    $predicate = RDFtriple::predicate("characterCount");
    $object = RDFtriple::literal(strlen($pageSource));

    $result->addTriple($subject, $predicate, $object);

    return $result;
}
/**
 * Parses a page with $this->parsePage() and converts the rows left in the
 * global $parseResult into RDF triple objects.
 *
 * Each row is read positionally: [0] subject URI, [1] predicate URI,
 * [2] object value, [3] "r" when the object is a resource, [4] literal
 * datatype, [5] literal language (falls back to $this->language when empty).
 *
 * Rows whose subject, predicate or resource object cannot be turned into a
 * URI are reported on standard output and skipped, as are rows whose
 * property name exceeds $GLOBALS['W2RCFG']['maximumPropertyLength'].
 *
 * @param mixed  $pageID     Page identifier; also used to build the page's own subject.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Page markup handed to parsePage().
 * @return ExtractionResult  Empty when parsing produced no rows.
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    global $parseResult; // Contains the extraction result
    $parseResult = null;
    $this->parsePage($pageID, $pageSource, $this->language);

    // $parseResult stays null when the parser produced nothing; test for that
    // explicitly because count(null) is an error on current PHP versions.
    if ($parseResult === null || count($parseResult) < 1) {
        return $result;
    }

    // Distinct property names seen so far on this page.
    $knownProperties = array($parseResult[0][1]);

    foreach ($parseResult as $myTriple) {
        try {
            $subject = RDFtriple::URI($myTriple[0]);
        } catch (Exception $e) {
            echo 'Caught exception: ', $e->getMessage(), "\n";
            continue;
        }

        // Rename properties like LeaderName1, LeaderName2, ... to LeaderName
        if (preg_match("/(.*[^0-9_]+)([0-9])\$/", $myTriple[1], $matches)) {
            // If the property consists of letters from a writing system other than
            // Latin, e.g. Korean, the words are encoded as e.g. _percent_B1 and must
            // not be changed. With language.use_percent_encoding = false the same
            // suffix looks like e.g. %B1.
            // preg_match() replaces the removed ereg(); both are only tested for a hit.
            if (substr(substr($myTriple[1], -11), 0, 9) != "_percent_"
                && !preg_match('/%([A-F0-9]{2})/', substr($myTriple[1], -3))
            ) {
                // Strict membership test: the previous array_search() truthiness
                // check treated a hit at index 0 as "not found".
                if (!in_array($matches[1], $knownProperties, true)) {
                    $knownProperties[] = $matches[1];
                }
                $myTriple[1] = $matches[1];
            }
        } elseif (!in_array($myTriple[1], $knownProperties, true)) {
            $knownProperties[] = $myTriple[1];
        }

        // If a property is longer than the maximum configured length,
        // we do not write the triple.
        if (strlen($myTriple[1]) > $GLOBALS['W2RCFG']['maximumPropertyLength']) {
            continue;
        }

        try {
            $predicate = RDFtriple::URI($myTriple[1]);
        } catch (Exception $e) {
            echo 'Caught exception: ', $e->getMessage(), "\n";
            continue;
        }

        if ($myTriple[3] == "r") {
            try {
                $object = RDFtriple::URI($myTriple[2]);
            } catch (Exception $e) {
                echo 'Caught exception: ', $e->getMessage(), "\n";
                continue;
            }
        } else {
            if ($myTriple[5] == null) {
                $myTriple[5] = $this->language;
            }
            $object = RDFtriple::literal($myTriple[2], $myTriple[4], $myTriple[5]);
        }

        // This is for the db:London/rating subtemplate problem: when the subject
        // URI extends the page's own URI, annotate the triple so it cascades
        // on delete with the page's subject.
        $triple = new RDFtriple($subject, $predicate, $object);
        $currentSubject = RDFtriple::page($pageID);
        $small = $currentSubject->getURI();
        $big = $subject->getURI();
        if (strpos($big, $small) === 0 && strlen($big) > strlen($small)) {
            $triple->addOnDeleteCascadeAnnotation($currentSubject);
        }

        $result->addTripleObject($triple);
        $this->allPredicates->addPredicate($myTriple[1]);
    }

    return $result;
}