/**
 * Extracts semantic annotations of the forms [[property::target]] and
 * [[property:=value]] from a page's wiki source.
 *
 * Brace-delimited templates are stripped from the source first (per the
 * original note, templates are already handled by the Infobox Extractor).
 *
 * @param mixed  $pageID     Page identifier; used raw for the ExtractionResult
 *                           and passed through encodeLocalName() for triple subjects.
 * @param mixed  $pageTitle  Not used by this method.
 * @param string $pageSource Wiki markup of the page.
 *
 * @return ExtractionResult Result object holding one triple per annotation found.
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $pageID = encodeLocalName($pageID);

    // Find every (possibly nested) brace group and delete it from the source.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the enclosing "{{" / "}}" so the match can be re-wrapped below.
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
    }

    // Internal semantic links: [[property::target]].
    // The property name class is a-z, A-Z, 0-9, '-', ' ' and '_'. The previous
    // range "A-z" additionally admitted "[", "\", "]", "^" and "`".
    preg_match_all(
        '/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/',
        $pageSource,
        $matches,
        PREG_SET_ORDER
    );
    foreach ($matches as $match) {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate(encodeLocalName($match[2])),
            RDFtriple::page($match[4])
        );
    }

    // Literals: [[property:=value]].
    preg_match_all(
        '/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/',
        $pageSource,
        $matches,
        PREG_SET_ORDER
    );
    foreach ($matches as $match) {
        // parseAttributeValue yields: object, object_is, datatype(, language)
        $triple = parseAttributeValue($match[4], $pageID, $match[2]);
        $lexicalForm = $triple[0];
        $datatype = $triple[2];
        $predicate = propertyToCamelCase(encodeLocalName($match[2]));

        // Skip values that parsed to nothing (loose comparison kept on purpose).
        if ($lexicalForm == null) {
            continue;
        }

        // NOTE: the language tag is hard-coded to 'en' here.
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFTriple::predicate($predicate),
            RDFtriple::literal($lexicalForm, $datatype, 'en')
        );
    }

    return $result;
}
/**
 * Parses one wiki template into triples and writes them via writeTripel().
 *
 * The template text is split on "|" into "property=value" pairs (or into
 * positional values named property1, property2, ... when the template contains
 * no "=" at all). Sub-templates found inside a value are parsed recursively
 * under their own subject URI and linked to the current subject.
 *
 * @param string      $subject  Subject URI the extracted triples are attached to.
 * @param string      $template Template source without the outer braces.
 * @param string|null $language Language passed on to parseAttributeValue().
 *                              Note: it is not forwarded to recursive sub-template calls.
 *
 * @return bool True if at least one property was extracted; false if the template
 *              is ignored, has too few attributes, or yielded nothing.
 */
function parseTemplate($subject, $template, $language = NULL)
{
    // If template/subTemplate is listed as ignored, return false
    if (isIgnored($template, $tplName)) {
        return false;
    }

    // Find sub-templates and remove those that are listed as ignored
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $template, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the enclosing "{{" / "}}"
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        if (isIgnored($subTemplate, $tplName)) {
            $template = str_replace('{{' . $subTemplate . '}}', '', $template);
        }
    }

    // Mask "|" inside sub-templates so they are not split like triples
    // (presumably replaced by "####", which is what gets reverted below).
    $template = preg_replace_callback(
        "/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/",
        'replaceBarInSubtemplate',
        $template
    );

    // Whether the template uses named ("key=value") attributes at all
    $equal = preg_match('~=~', $template);

    // Mask "|" inside wiki links, e.g.
    // Gruppe=[[Gruppe-3-Element|3]] becomes Gruppe=[[Gruppe-3-Element***3]]
    do {
        $template = preg_replace('/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/', '[[\\1***\\2]]', $template, -1, $count);
    } while ($count);

    $triples = explode('|', $template);
    if (count($triples) <= $GLOBALS['W2RCFG']['minAttributeCount']) {
        return false;
    }

    // The first segment is the template name
    $templateName = strtolower(trim(array_shift($triples)));

    // Sub-template URIs already used; a repeated URI gets a counter appended
    $knownSubTemplateURI = array();

    $s = $subject;
    $z = 0;
    $extracted = false;

    foreach ($triples as $triple) {
        if ($equal) {
            $split = explode('=', $triple, 2);
            if (count($split) < 2) {
                continue;
            }
            list($p, $o) = $split;
            $p = trim($p);
        } else {
            // Positional attribute: synthesize a property name
            $p = "property" . ++$z;
            $o = $triple;
        }
        $o = trim($o);

        // Special case: for "date", strip link brackets and normalise the dash
        // so that time spans can be parsed
        if ($p === "date") {
            $o = str_replace(array("[", "]"), "", $o);
            $o = str_replace("–", "-", $o);
        }

        // Do not allow empty properties
        if (strlen($p) < 1) {
            continue;
        }
        if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) {
            continue;
        }
        // Nothing to extract from an empty value
        if ($o === '' || $o === NULL) {
            continue;
        }

        // Predicate: CamelCase, no underscores, no hyphens
        $p = propertyToCamelCase($p);

        // Prefix with the template name if configured, or always for
        // positional attributes
        if ($GLOBALS['prefixPropertiesWithTemplateName']) {
            $p = propertyToCamelCase($templateName) . '_' . $p;
        } elseif (!$equal) {
            $p = propertyToCamelCase($templateName . "_" . $p);
        }

        // Object: restore "|" inside wiki links
        $o = str_replace('***', '|', $o);

        // Remove HTML markup for whitespace: both the entity and a raw
        // (UTF-8) non-breaking space become a plain space
        $o = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $o);

        // Parse sub-templates (only sub-templates with values yield a triple)
        if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) {
            foreach ($subTemplates as $subTemplate) {
                // Replace #### back to |, in order to parse the sub-template properly
                $tpl = str_replace("####", "|", $subTemplate[2]);

                // If the sub-template has values, only its name goes into the subject
                if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {
                    $subTemplateSubject = $subject . '/' . $p . '/' . $match[1];
                } else {
                    $subTemplateSubject = $subject . '/' . $p . '/' . $tpl;
                }

                // Make repeated sub-template URIs unique by appending a counter, e.g.
                // http://dbpedia.org/United_Kingdom/footnote/cite_web
                // ==> http://dbpedia.org/United_Kingdom/footnote/cite_web1 ...
                if (!isset($knownSubTemplateURI[$subTemplateSubject])) {
                    $knownSubTemplateURI[$subTemplateSubject] = 0;
                } else {
                    $suffix = ++$knownSubTemplateURI[$subTemplateSubject];
                    $subTemplateSubject .= $suffix;
                }

                // If the sub-template contained real values, link it to the subject
                if (parseTemplate($subTemplateSubject, $tpl)) {
                    writeTripel($s, $GLOBALS['W2RCFG']['propertyBase'] . $p, $subTemplateSubject, 'main', 'r', null, null);
                }
            }
        }

        // Remove sub-templates from the remaining string value
        $o = str_replace("####", "|", $o);
        $o = preg_replace("/\\{{2}[^\\}]+\\}{2}/", "", $o);

        // Sometimes only whitespace remains; then continue with the next triple
        if (preg_match("/^[\\s]*\$/", $o)) {
            continue;
        }

        // Replace the predicate if necessary to make it unambiguous
        $p = replacePredicate($p);

        // Add the URI prefix to the property name
        $p = $GLOBALS['W2RCFG']['propertyBase'] . $p;

        if (isBlanknoteList($o)) {
            printList($s, $p, $o);
        } else {
            list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p, $language);

            // Check for "no value" first: running str_replace() on null would
            // turn it into '' and an empty triple would be written.
            if ($o !== NULL) {
                // Special newline handling: line breaks become "\n" in literals
                // and are dropped from resources
                $br = array('<br>', '<br/>', '<br />');
                if ($o_is == 'l') {
                    $o = str_replace($br, "\n", $o);
                } elseif ($o_is == 'r') {
                    $o = str_replace($br, '', $o);
                }
                writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
            }
        }

        $extracted = true;
    }

    // Record which template the subject uses, but only if something was extracted
    if ($extracted) {
        writeTripel(
            $s,
            $GLOBALS['W2RCFG']['templateProperty'],
            $GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $templateName
        );
    }

    return $extracted;
}
/**
 * Generates triples for every template found in $value, recursing into
 * argument values to pick up nested templates.
 *
 * @param BreadCrumb $breadCrumb Position of $value within the page; depth 0 is
 *                               the page itself.
 * @param string     $value      Wiki text to scan for templates.
 *
 * @return array Three-element array:
 *               [0] the generated triples,
 *               [1] meta triples (rdf:type and wikiPageUsesTemplate, depth 0 only),
 *               [2] the used templates as a set (template name => 1).
 */
private function myGenerate(BreadCrumb $breadCrumb, $value)
{
    // triples, meta triples, used templates
    $result = array(array(), array(), array());

    $rootSubjectUri = RDFTriple::page($breadCrumb->getRoot());

    // Only filled on depth 0
    $relatedClasses = array();

    // 'parent' means the parent of the value, i.e. the subject
    $parentName = $this->breadCrumbTransformer->transform($breadCrumb);
    $parentResource = RDFTriple::page($parentName);

    // Template annotation of the current template; only looked up on depth 0
    $ta = null;

    // Get all templates in this text, indexed by name
    // (there may be multiple templates with the same name)
    $nameToTemplates = SimpleWikiTemplateMatcher::match($value);

    foreach ($nameToTemplates as $templateName => $templates) {
        if (strlen($templateName) < 1) {
            continue;
        }

        $templateName = $this->mediaWikiUtil->toCanonicalWikiCase($templateName);
        if (!$this->templateNameFilter->doesAccept($templateName)) {
            continue;
        }

        $templateUri = RDFTriple::URI(DB_TEMPLATE_NS . $templateName, false);
        $result[2][$templateName] = 1;

        // Get annotations for the template, if there are any. Related classes
        // are collected for the page itself only (not for sub-templates).
        $lookupName = "Template:{$templateName}/doc";
        if ($breadCrumb->getDepth() == 0) {
            $ta = $this->templateDb->getTemplateAnnotation($lookupName);
            if (isset($ta)) {
                foreach ($ta->getRelatedClasses() as $item) {
                    $relatedClasses[$item] = 1;
                }
            }
        }

        foreach ($templates as $templateIndex => $template) {
            foreach ($template->getArguments() as $argumentName => $values) {
                // propertyNs defaults to DB_PROPERTY_NS unless a mapping exists
                // in the template db; in that case it is DB_ONTOLOGY_NS
                $propertyNs = DB_PROPERTY_NS;
                $pa = null;
                if (isset($ta)) {
                    $pas = $ta->getPropertyAnnotations();
                    if (array_key_exists($argumentName, $pas)) {
                        $pa = $pas[$argumentName];
                        $propertyNs = DB_ONTOLOGY_NS;
                    }
                }

                // Fake a property mapping if there was none in the db;
                // it maps the argument name back to itself
                if (!isset($pa)) {
                    // Unmapped properties may be suppressed by option, but
                    // only on the page itself; sub-resources always keep them
                    if ($this->allowUnmappedProperties != true && $breadCrumb->getDepth() == 0) {
                        continue;
                    }

                    // Cosmetic: rename numeric argument names
                    // (e.g. 1 becomes property1)
                    if (is_numeric($argumentName)) {
                        $argumentName = "property{$argumentName}";
                    }

                    $pa = new PropertyAnnotation($argumentName);
                    $pa->addMapping(new PropertyMapping($argumentName));
                }

                foreach ($pa->getMappings() as $pm) {
                    $parseHint = $pm->getParseHint();

                    // Use the renamed value if set, otherwise the original
                    // argument name. A per-mapping variable is used so that one
                    // mapping's rename never leaks into the next mapping.
                    $mappedName = $argumentName;
                    if (!isEmptyString($pm->getRenamedValue())) {
                        $mappedName = $pm->getRenamedValue();
                    }
                    $mappedName = trim($mappedName);

                    // Skip empty properties
                    if (strlen($mappedName) < 1) {
                        continue;
                    }

                    $childBreadcrumb = $breadCrumb->createClone();
                    $childBreadcrumb->push(new BreadcrumbNode($templateName, $templateIndex, $mappedName));
                    $templateChildName = $this->breadCrumbTransformer->transform($childBreadcrumb);

                    // If there is no parse hint we might be able to derive it
                    if (!isset($parseHint)) {
                        $parseHint = $this->deriveParseHintFromName($mappedName);
                    }

                    // Without a triple generator we fall through to default handling
                    $tripleGenerator = $this->getTripleGenerator($parseHint);

                    $localResult = array(array(), array(), array());

                    if (isset($tripleGenerator)) {
                        foreach ($values as $rawValue) {
                            $rawValue = trim($rawValue);
                            if ($rawValue === "") {
                                continue;
                            }

                            $generated = $tripleGenerator->generate($templateChildName, $mappedName, $rawValue);
                            $localResult[0] = array_merge($localResult[0], $generated);
                        }
                    } else {
                        // No parse hint: default handling.
                        // Turn the argument name into a property name.
                        $mappedName = encodeLocalName(propertyToCamelCase($mappedName));

                        if (in_array($mappedName, $GLOBALS['W2RCFG']['ignoreProperties'])) {
                            continue;
                        }

                        $propertyName = $propertyNs . $mappedName;

                        foreach ($values as $rawValue) {
                            $rawValue = trim($rawValue);
                            if ($rawValue === "") {
                                continue;
                            }

                            // Special case: for "date", strip link brackets and
                            // normalise the dash so time spans can be parsed
                            if ($mappedName === "date") {
                                $rawValue = str_replace("[", "", $rawValue);
                                $rawValue = str_replace("]", "", $rawValue);
                                $rawValue = str_replace("–", "-", $rawValue);
                            }

                            // Parse out sub-templates contained in the value
                            $subResources = $this->myGenerate($childBreadcrumb, $rawValue);
                            $localResult[0] = array_merge($localResult[0], $subResources[0]);
                            $localResult[1] = array_merge($localResult[1], $subResources[1]);
                            // Index 2 is keyed by template name: use a union,
                            // since array_merge() would renumber numeric names
                            $localResult[2] = $localResult[2] + $subResources[2];

                            $localResult[0] = array_merge(
                                $localResult[0],
                                parseAttributeValueWrapper($rawValue, $templateChildName, $propertyName, $this->language)
                            );
                        }
                    }

                    // Annotate each generated triple with its source template.
                    // Only index 0 is annotated; meta triples are left alone.
                    foreach ($localResult[0] as $triple) {
                        $triple->addExtractedFromTemplateAnnotation($templateUri);
                    }

                    // Add the on-delete-cascade annotation below depth 1
                    if ($breadCrumb->getDepth() > 1) {
                        foreach ($localResult[0] as $triple) {
                            $triple->addOnDeleteCascadeAnnotation($rootSubjectUri);
                        }
                    }

                    // Merge the results
                    $result[0] = array_merge($result[0], $localResult[0]);
                    $result[1] = array_merge($result[1], $localResult[1]);
                    $result[2] = $result[2] + $localResult[2];
                }
            }
            // Open question from the original author: how to connect a
            // sub-subject to the root subject? No parent-child triple is
            // generated at present.
        }
    }

    if (count($relatedClasses) > 0) {
        foreach (array_keys($relatedClasses) as $relatedClass) {
            $result[1][] = new RDFtriple(
                $parentResource,
                RDFtriple::URI(RDF_TYPE, false),
                RDFtriple::URI(DB_ONTOLOGY_NS . $relatedClass, false)
            );
        }
    } elseif ($breadCrumb->getDepth() == 0) {
        // No related class exists: default to rdf:type owl:Thing
        $result[1][] = new RDFtriple(
            $parentResource,
            RDFtriple::URI(RDF_TYPE, false),
            RDFtriple::URI(OWL_THING, false)
        );
    }

    // Add the wiki-page-uses-template triples, but only on depth 0
    if ($breadCrumb->getDepth() == 0) {
        foreach (array_keys($result[2]) as $name) {
            $result[1][] = new RDFTriple(
                $parentResource,
                self::$wikiPageUsesTemplateUri,
                RDFTriple::URI(DB_TEMPLATE_NS . $name, false)
            );
        }
    }

    $n = count($result[0]) + count($result[1]);
    $this->log(TRACE, "Generated a total of {$n} triples at {$breadCrumb}");
    foreach ($result[0] as $item) {
        $this->log(TRACE, $item);
    }
    foreach ($result[1] as $item) {
        $this->log(TRACE, $item);
    }

    return $result;
}
if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) { continue; } if ($o !== '' & $o !== NULL) { $pred = $p; // if(!$GLOBALS['templateStatistics'] && $GLOBALS['propertyStat'][$p]['count']<10) //continue; // predicate // Write properties CamelCase, no underscores, no hyphens. If first char is digit, add _ at the beginning $p = propertyToCamelCase($p); // Add prefixProperties if set true in config.inc.php if ($GLOBALS['prefixPropertiesWithTemplateName']) { $p = propertyToCamelCase($templateName) . '_' . $p; } else { if (!$equal) { $p = propertyToCamelCase($templateName . "_" . $p); } } // object $o = str_replace('***', '|', $o); // Remove HTML Markup for whitespaces $o = str_replace(' ', ' ', $o); //remove <ref> Content</ref> //$o = preg_replace('/(<ref>.+?<\/ref>)/s','',$o); // Parse Subtemplates (only parse Subtemplates with values!) if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) { foreach ($subTemplates as $subTemplate) { // Replace #### back to |, in order to parse subtemplate properly $tpl = str_replace("####", "|", $subTemplate[2]); // If subtemplate contains values, the subject is only the first word if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {