/**
 * Extracts SKOS triples for a category page.
 *
 * If $pageID does not contain "Category:", an empty result is returned.
 * Otherwise the result contains:
 *  - a skos:prefLabel triple with the decoded page title,
 *  - an rdf:type skos:Concept triple,
 *  - one skos:broader triple per [[Category:...]] link in the page source.
 *
 * @param string $pageID     page identifier, tested for the "Category:" prefix
 * @param string $pageTitle  page title, passed through decode_title() for the label
 * @param string $pageSource wiki markup of the page
 * @return ExtractionResult
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    // Only category pages produce triples.
    if (!preg_match("/Category:(.*)/", $pageID)) {
        return $result;
    }

    $result->addTriple(
        RDFTriple::page($pageID),
        RDFTriple::URI("http://www.w3.org/2004/02/skos/core#prefLabel"),
        RDFTriple::Literal($this->decode_title($pageTitle), NULL, $this->language)
    );
    $result->addTriple(
        RDFTriple::page($pageID),
        RDFTriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
        RDFTriple::URI("http://www.w3.org/2004/02/skos/core#Concept")
    );

    // The category name ends at the first "|" (sort key) or "]". A greedy
    // ".*" would merge several links on one line into a single bogus name
    // and would keep the sort key as part of the category name.
    $linkPattern = '/\[\[Category:([^\]|\r\n]+)(?:\|[^\]\r\n]*)?\]\]/';
    if (preg_match_all($linkPattern, $pageSource, $categoryLinks, PREG_SET_ORDER)) {
        foreach ($categoryLinks as $categoryLink) {
            $result->addTriple(
                RDFTriple::page($pageID),
                RDFTriple::URI("http://www.w3.org/2004/02/skos/core#broader"),
                RDFTriple::page("Category:" . $categoryLink[1])
            );
        }
    }

    return $result;
}
/**
 * Generates triples for every template found in a piece of wiki text,
 * recursing into template argument values to pick up nested templates.
 *
 * The return value is an array with three elements:
 *   [0] the generated data triples,
 *   [1] meta triples (rdf:type and wikiPageUsesTemplate statements, which are
 *       only created when the bread crumb depth is 0),
 *   [2] the names of the accepted templates, as a map templateName => 1.
 *
 * @param BreadCrumb $breadCrumb path from the root page to $value
 * @param string     $value      wiki text that is scanned for templates
 * @return array three-element array as described above
 */
private function myGenerate(BreadCrumb $breadCrumb, $value)
{
    // [0] => triples, [1] => meta triples, [2] => used template names
    $result = array(array(), array(), array());

    $rootSubjectUri = RDFTriple::page($breadCrumb->getRoot());

    // Only filled on depth 0 (see the template annotation lookup below)
    $relatedClasses = array();

    // 'parent' is the resource the meta triples are attached to
    $parentName = $this->breadCrumbTransformer->transform($breadCrumb);
    $parentResource = RDFTriple::page($parentName);

    // All templates in the value, indexed by name
    // (there may be multiple templates with the same name)
    $nameToTemplates = SimpleWikiTemplateMatcher::match($value);

    foreach ($nameToTemplates as $templateName => $templates) {
        if (strlen($templateName) < 1) {
            continue;
        }

        $templateName = $this->mediaWikiUtil->toCanonicalWikiCase($templateName);
        if (!$this->templateNameFilter->doesAccept($templateName)) {
            continue;
        }

        $templateUri = RDFTriple::URI(DB_TEMPLATE_NS . $templateName, false);
        $result[2][$templateName] = 1;

        // Template annotations are only looked up for the page itself
        // (depth 0), not for sub templates.
        $ta = null;
        if ($breadCrumb->getDepth() == 0) {
            $ta = $this->templateDb->getTemplateAnnotation("Template:{$templateName}/doc");
            if (isset($ta)) {
                foreach ($ta->getRelatedClasses() as $item) {
                    $relatedClasses[$item] = 1;
                }
            }
        }

        foreach ($templates as $templateIndex => $template) {
            foreach ($template->getArguments() as $argumentName => $values) {
                // propertyNs defaults to DB_PROPERTY_NS unless there is a
                // mapping in the template db; then it is DB_ONTOLOGY_NS.
                $propertyNs = DB_PROPERTY_NS;
                $pa = null;
                if (isset($ta)) {
                    $pas = $ta->getPropertyAnnotations();
                    if (array_key_exists($argumentName, $pas)) {
                        $pa = $pas[$argumentName];
                        $propertyNs = DB_ONTOLOGY_NS;
                    }
                }

                // Fake a mapping that maps the argument name to itself
                if (!isset($pa)) {
                    // Unmapped properties can be suppressed by an option,
                    // but they are always allowed on sub resources.
                    if ($this->allowUnmappedProperties != true && $breadCrumb->getDepth() == 0) {
                        continue;
                    }

                    // Cosmetic: numeric argument names (e.g. 1) become property1
                    if (is_numeric($argumentName)) {
                        $argumentName = "property{$argumentName}";
                    }

                    $pa = new PropertyAnnotation($argumentName);
                    $pa->addMapping(new PropertyMapping($argumentName));
                }

                foreach ($pa->getMappings() as $pm) {
                    $parseHint = $pm->getParseHint();

                    // Use the renamed value if the mapping has one, otherwise
                    // the original argument name. The name is kept in a
                    // per-mapping local so that one mapping's rename/encoding
                    // does not leak into the next mapping of the same argument.
                    $mappedName = $argumentName;
                    if (!isEmptyString($pm->getRenamedValue())) {
                        $mappedName = $pm->getRenamedValue();
                    }
                    $mappedName = trim($mappedName);

                    // Skip empty properties
                    if (strlen($mappedName) < 1) {
                        continue;
                    }

                    $childBreadcrumb = $breadCrumb->createClone();
                    $childBreadcrumb->push(new BreadcrumbNode($templateName, $templateIndex, $mappedName));
                    $templateChildName = $this->breadCrumbTransformer->transform($childBreadcrumb);

                    // Without an explicit parse hint, try to derive one from the name
                    if (!isset($parseHint)) {
                        $parseHint = $this->deriveParseHintFromName($mappedName);
                    }
                    $tripleGenerator = $this->getTripleGenerator($parseHint);

                    $localResult = array(array(), array(), array());

                    if (isset($tripleGenerator)) {
                        // A dedicated triple generator handles the values
                        foreach ($values as $rawValue) {
                            $rawValue = trim($rawValue);
                            if ($rawValue == "") {
                                continue;
                            }

                            $generated = $tripleGenerator->generate($templateChildName, $mappedName, $rawValue);
                            $localResult[0] = array_merge($localResult[0], $generated);
                        }
                    } else {
                        // Default handling: turn the name into a property URI
                        $localName = encodeLocalName(propertyToCamelCase($mappedName));

                        if (in_array($localName, $GLOBALS['W2RCFG']['ignoreProperties'])) {
                            continue;
                        }

                        $propertyName = $propertyNs . $localName;

                        foreach ($values as $rawValue) {
                            $rawValue = trim($rawValue);
                            if ($rawValue == "") {
                                continue;
                            }

                            // Special case for "date": strip link brackets and
                            // normalise the en dash to a hyphen
                            if ($localName == "date") {
                                $rawValue = str_replace("[", "", $rawValue);
                                $rawValue = str_replace("]", "", $rawValue);
                                $rawValue = str_replace("–", "-", $rawValue);
                            }

                            // Parse out sub templates contained in the value
                            $subResources = $this->myGenerate($childBreadcrumb, $rawValue);
                            for ($i = 0; $i < 3; ++$i) {
                                $localResult[$i] = array_merge($localResult[$i], $subResources[$i]);
                            }

                            $localResult[0] = array_merge(
                                $localResult[0],
                                parseAttributeValueWrapper($rawValue, $templateChildName, $propertyName, $this->language)
                            );
                        }
                    }

                    // Annotate each data triple with the template it came from
                    foreach ($localResult[0] as $triple) {
                        $triple->addExtractedFromTemplateAnnotation($templateUri);
                    }

                    // Add the on-delete-cascade annotation below depth 1
                    if ($breadCrumb->getDepth() > 1) {
                        foreach ($localResult[0] as $triple) {
                            $triple->addOnDeleteCascadeAnnotation($rootSubjectUri);
                        }
                    }

                    for ($i = 0; $i < 3; ++$i) {
                        $result[$i] = array_merge($result[$i], $localResult[$i]);
                    }
                }
            }
        }
    }

    // rdf:type triples: one per related class, or owl:Thing as the default
    // on depth 0 if no related class was found
    if (count($relatedClasses) > 0) {
        foreach ($relatedClasses as $relatedClass => $dummy) {
            $result[1][] = new RDFTriple(
                $parentResource,
                RDFTriple::URI(RDF_TYPE, false),
                RDFTriple::URI(DB_ONTOLOGY_NS . $relatedClass, false)
            );
        }
    } else {
        if ($breadCrumb->getDepth() == 0) {
            $result[1][] = new RDFTriple(
                $parentResource,
                RDFTriple::URI(RDF_TYPE, false),
                RDFTriple::URI(OWL_THING, false)
            );
        }
    }

    // Add the wiki page uses template triples - but only on depth 0
    if ($breadCrumb->getDepth() == 0) {
        foreach ($result[2] as $name => $dummy) {
            $result[1][] = new RDFTriple(
                $parentResource,
                self::$wikiPageUsesTemplateUri,
                RDFTriple::URI(DB_TEMPLATE_NS . $name, false)
            );
        }
    }

    $n = count($result[0]) + count($result[1]);
    $this->log(TRACE, "Generated a total of {$n} triples at {$breadCrumb}");
    foreach ($result[0] as $item) {
        $this->log(TRACE, $item);
    }
    foreach ($result[1] as $item) {
        $this->log(TRACE, $item);
    }

    return $result;
}
/**
 * Starts the class extraction.
 *
 * For every row of $this->result the page is declared an owl:Class, and an
 * rdfs:subClassOf triple is added for each distinct, non-empty entry returned
 * by get_superclass(). All triples are added to $this->extractionResult.
 * Progress is echoed once per 1000 processed rows.
 *
 * The former article-to-class assignment step (ArticlesToClasses) was
 * commented out in the original and is not performed here.
 */
public function extractClasses()
{
    include "./extractors/infobox/config.inc.php";
    include "./databaseconfig.php";

    echo "Start der Klassenberechnung... Zu bearbeitende Klassen (isClass=1):" . mysql_num_rows($this->result) . "\n";

    $processed = 0;
    while ($row = mysql_fetch_assoc($this->result)) {
        $pageTitle = $row['page_title'];
        $pageId = $row['page_id'];
        $classUri = $GLOBALS['W2RCFG']['wikipediaBase'] . urlencode($pageTitle);

        // Declare the page to be a class
        $this->extractionResult->addTriple(
            RDFtriple::URI($classUri),
            RDFtriple::URI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
            RDFtriple::URI("http://www.w3.org/2002/07/owl#Class")
        );

        // NOTE(review): the trailing arguments 0 and 10 are presumably the
        // start depth and the maximum depth of the lookup - confirm against
        // get_superclass().
        $superClasses = array_unique($this->get_superclass($pageId, $pageTitle, $this->link, 0, 10));

        foreach ($superClasses as $superClass) {
            if (strlen($superClass) > 0) {
                $this->extractionResult->addTriple(
                    RDFtriple::URI($classUri),
                    RDFtriple::URI("http://www.w3.org/2000/01/rdf-schema#subClassOf"),
                    RDFtriple::URI($GLOBALS['W2RCFG']['wikipediaBase'] . urlencode($superClass))
                );
            }
        }

        // Count after the row is done, so the message is never printed for
        // zero processed classes. Flushing the triples to the destination at
        // this point was disabled in the original because it seemed to cause
        // problems, so it is not done here either.
        $processed++;
        if ($processed % 1000 == 0) {
            echo "1000 Klassen fertig bearbeitet, gesamt: {$processed}\n";
        }
    }
}