Exemplo n.º 1
0
 /**
  * Extracts semantic annotations from the wiki source of one page.
  *
  * Brace-delimited template spans are stripped from the source first (the
  * original author notes they are already handled by the Infobox Extractor).
  * Two annotation forms are then collected from the remaining text:
  *   - [[property::target]]  becomes a triple whose object is a page resource
  *   - [[property:=value]]   becomes a triple whose object is a literal
  *
  * @param mixed  $pageID     Page identifier; passed raw to ExtractionResult and,
  *                           after encodeLocalName(), used as the triple subject.
  * @param mixed  $pageTitle  Not read by this method.
  * @param string $pageSource Wiki markup of the page.
  *
  * @return ExtractionResult Result object holding every triple that was added.
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);

     // Collect (possibly nested) brace-delimited spans and remove each one
     // that appears in the source wrapped in double braces.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Strip the leading "{{" and trailing "}}" so the span can be
         // re-wrapped uniformly for the replacement below.
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }

     // Internal semantic links: [[property::target]].
     // The property class is A-Z (not A-z): the A-z range also covers the
     // characters [ \ ] ^ _ and `, which let a stray bracket leak into the
     // captured property name.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }

     // Literal annotations: [[property:=value]].
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // Per the original comment the parser returns:
         // object, object_is, datatype(, language)
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));

         // Skip values the parser could not turn into anything.
         // NOTE(review): the loose comparison also skips '' and integer 0;
         // kept as in the original - confirm whether 0 should be emitted.
         if ($lexicalForm == null) {
             continue;
         }

         // NOTE(review): the literal language is hard-coded to 'en' although
         // $this->language is available - confirm this is intended.
         $result->addTriple(RDFtriple::page($pageID), RDFTriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }

     return $result;
 }
Exemplo n.º 2
0
/**
 * Parses the text of one wiki template and writes a triple per attribute.
 *
 * The template text is split on "|"; the first piece is the template name and
 * every following piece is an attribute ("name=value", or a positional value
 * when the template contains no "=" at all). Sub-templates found inside an
 * attribute value are parsed recursively under a derived subject URI and are
 * linked to the current subject. All output goes through writeTripel() /
 * printList(); nothing but a success flag is returned.
 *
 * @param string      $subject  Subject URI the attributes are attached to.
 * @param string      $template Template text. The recursive call passes the text
 *                              between "{{" and "}}"; presumably outer callers do
 *                              the same - TODO confirm.
 * @param string|null $language Passed through to parseAttributeValue().
 *
 * @return bool True when at least one attribute was processed to the end,
 *              false when the template is ignored, has too few attributes,
 *              or yielded nothing.
 */
function parseTemplate($subject, $template, $language = NULL)
{
    // If template/subTemplate is listed as ignored, return false.
    // NOTE(review): $tplName is not defined in this function; presumably
    // isIgnored() receives it by reference - confirm against its signature.
    if (isIgnored($template, $tplName)) {
        return false;
    }

    // Find sub-templates and remove those that are listed as ignored.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $template, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the enclosing double braces.
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        if (isIgnored($subTemplate, $tplName)) {
            $template = str_replace('{{' . $subTemplate . '}}', '', $template);
        }
    }

    // Mask "|" inside sub-templates so the explode() below does not split them.
    // The placeholder is chosen by replaceBarInSubtemplate(); the code further
    // down restores "####", so that is presumably the placeholder - TODO confirm.
    $template = preg_replace_callback("/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/", 'replaceBarInSubtemplate', $template);

    // Truthy when the template contains "=" anywhere, i.e. uses named attributes.
    $equal = preg_match('~=~', $template);

    // Mask the "|" of piped wiki links:
    // Gruppe=[[Gruppe-3-Element|3]] is replaced by Gruppe=[[Gruppe-3-Element***3]].
    // Repeated until no replacement happens so links with several bars are covered.
    do {
        $template = preg_replace('/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/', '[[\\1***\\2]]', $template, -1, $count);
    } while ($count);

    $triples = explode('|', $template);
    if (count($triples) <= $GLOBALS['W2RCFG']['minAttributeCount']) {
        return false;
    }
    $templateName = strtolower(trim(array_shift($triples)));

    // Sub-template subject URIs already used in this call, mapped to a counter.
    // When a URI repeats, the counter is appended to keep subjects distinct.
    $knownSubTemplateURI = array();

    // subject
    $s = $subject;
    // Running index used to name positional attributes ("property1", ...).
    $z = 0;
    // Initialised up front so the function can simply return it at the end.
    $extracted = false;

    foreach ($triples as $triple) {
        if ($equal) {
            $split = explode('=', $triple, 2);
            if (count($split) < 2) {
                continue;
            }
            list($p, $o) = $split;
            $p = trim($p);
        } else {
            $p = "property" . ++$z;
            $o = $triple;
        }
        $o = trim($o);

        // Special case: for the "date" property strip link brackets and
        // normalise the en-dash entity so time spans can be parsed.
        if ($p == "date") {
            $o = str_replace("[", "", str_replace("]", "", $o));
            $o = str_replace("&ndash;", "-", $o);
        }

        // Do not allow empty properties.
        if (strlen($p) < 1) {
            continue;
        }
        if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) {
            continue;
        }

        // Logical && (the original used bitwise &, which only worked because
        // both operands are booleans).
        if ($o !== '' && $o !== NULL) {
            // predicate
            // Write properties CamelCase, no underscores, no hyphens.
            // If the first char is a digit, add _ at the beginning.
            $p = propertyToCamelCase($p);

            // Prefix the property with the template name if enabled in the
            // configuration; positional properties are always prefixed.
            if ($GLOBALS['prefixPropertiesWithTemplateName']) {
                $p = propertyToCamelCase($templateName) . '_' . $p;
            } elseif (!$equal) {
                $p = propertyToCamelCase($templateName . "_" . $p);
            }

            // object: restore the bars masked in wiki links above.
            $o = str_replace('***', '|', $o);
            // Remove HTML markup for whitespace.
            $o = str_replace('&nbsp;', ' ', $o);

            // Parse sub-templates (only sub-templates with values are linked).
            if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) {
                foreach ($subTemplates as $subTemplate) {
                    // Replace #### back to | in order to parse the sub-template properly.
                    $tpl = str_replace("####", "|", $subTemplate[2]);

                    // If the sub-template has values, only the part before the
                    // first "|" goes into the subject URI.
                    if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {
                        $subTemplateSubject = $subject . '/' . $p . '/' . $match[1];
                    } else {
                        $subTemplateSubject = $subject . '/' . $p . '/' . $tpl;
                    }

                    // Look up the URI among the known ones; if found, append a counter.
                    // e.g. http://dbpedia.org/United_Kingdom/footnote/cite_web
                    // ==>  http://dbpedia.org/United_Kingdom/footnote/cite_web1 ...
                    if (!isset($knownSubTemplateURI[$subTemplateSubject])) {
                        $knownSubTemplateURI[$subTemplateSubject] = 0;
                    } else {
                        $knownSubTemplateURI[$subTemplateSubject]++;
                        $subTemplateSubject .= $knownSubTemplateURI[$subTemplateSubject];
                    }

                    // If the sub-template contained real values, link it to the subject.
                    // NOTE(review): $language is not forwarded to the recursive
                    // call, so sub-template values are parsed with the default
                    // (NULL) language - confirm this is intended.
                    if (parseTemplate($subTemplateSubject, $tpl)) {
                        writeTripel($s, $GLOBALS['W2RCFG']['propertyBase'] . $p, $subTemplateSubject, 'main', 'r', null, null);
                    }
                }
            }

            // Remove sub-templates from the value string.
            $o = str_replace("####", "|", $o);
            $o = preg_replace("/\\{{2}[^\\}]+\\}{2}/", "", $o);

            // Sometimes only whitespace remains; then continue with the next
            // attribute (note: this does not count as "extracted").
            if (preg_match("/^[\\s]*\$/", $o)) {
                continue;
            }

            // Replace the predicate if necessary to make it unambiguous.
            $p = replacePredicate($p);
            // Add the URI prefix to the property name.
            $p = $GLOBALS['W2RCFG']['propertyBase'] . $p;

            if (isBlanknoteList($o)) {
                printList($s, $p, $o);
            } else {
                list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p, $language);

                // Special newline handling: 'l' values get real newlines,
                // 'r' values get the break tags dropped.
                $br = array('<br>', '<br/>', '<br />');
                if ($o_is == 'l') {
                    $o = str_replace($br, "\n", $o);
                } elseif ($o_is == 'r') {
                    $o = str_replace($br, '', $o);
                }

                if ($o !== NULL) {
                    writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
                }
            }

            $extracted = true;
        }
    }

    // Record which template the subject uses, but only if something was extracted.
    if ($extracted) {
        writeTripel($s, $GLOBALS['W2RCFG']['templateProperty'], $GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $templateName);
    }

    return $extracted;
}
Exemplo n.º 3
0
 /**
  * Generates triples for every accepted template found in $value.
  *
  * Returns an array with three elements:
  *   [0] the generated triples,
  *   [1] the meta triples (rdf:type and "wiki page uses template" triples),
  *   [2] the used template names, stored as array keys.
  *
  * Naming note: the local variable $result holds all three arrays, not
  * only the triples in element [0].
  *
  * The method calls itself for argument values handled by the default
  * (no triple generator) branch, using a child bread crumb.
  *
  * @param BreadCrumb $breadCrumb Position in the template hierarchy; depth 0
  *                               is treated as the page itself.
  * @param mixed      $value      Text passed to SimpleWikiTemplateMatcher::match().
  *
  * @return array The three-element array described above.
  */
 private function myGenerate(BreadCrumb $breadCrumb, $value)
 {
     // result is the array containing: triples, meta triples, used templates
     $result = array(array(), array(), array());
     $rootSubjectUri = RDFTriple::page($breadCrumb->getRoot());
     // this array is only relevant on depth 0
     $relatedClasses = array();
     //$metaTriples = array();
     //$usedTemplateNames = array();
     // 'parent' means the parent of the value - thus subject and predicate
     $parentName = $this->breadCrumbTransformer->transform($breadCrumb);
     $parentResource = RDFTriple::page($parentName);
     // NOTE(review): $parentPropertyName is assigned here but only referenced
     // in the commented-out parent-child code further down.
     $parentPropertyName = null;
     $tmp = $breadCrumb->peekTop(0);
     if (isset($tmp)) {
         $parentPropertyName = $tmp->getPropertyName();
     }
     // Get all templates on this site, indexed by name
     // (there may be multiple templates with the same name)
     $nameToTemplates = SimpleWikiTemplateMatcher::match($value);
     //print_r($value);
     //print_r($nameToTemplates);
     //echo "NOW COMES THE STORM\n";
     foreach ($nameToTemplates as $templateName => $templates) {
         // Skip templates without a name
         if (strlen($templateName) < 1) {
             continue;
         }
         //echo "GOT TEMPLATE NAME $templateName\n";
         $templateName = $this->mediaWikiUtil->toCanonicalWikiCase($templateName);
         if (!$this->templateNameFilter->doesAccept($templateName)) {
             continue;
         }
         $templateUri = RDFTriple::URI(DB_TEMPLATE_NS . $templateName, false);
         // Record the template as used (the name is the key, the value is a dummy)
         $result[2][$templateName] = 1;
         // Get annotations for the template - if there are any
         $lookupName = "Template:{$templateName}/doc";
         // $ta is only assigned at depth 0; at greater depths it stays unset,
         // so the isset($ta) checks below are false there.
         if ($breadCrumb->getDepth() == 0) {
             $ta = $this->templateDb->getTemplateAnnotation($lookupName);
             // Create the triples for "relatesToClass"
             // But only for the page itself (not for sub templates)
             // if no related class exists, default to rdf:type owl:Thing
             if (isset($ta)) {
                 foreach ($ta->getRelatedClasses() as $item) {
                     $relatedClasses[$item] = 1;
                 }
             }
         }
         foreach ($templates as $templateIndex => $template) {
             //echo "GOT TEMPLATE INDEX $templateIndex\n";
             // Iterate over all arguments
             $arguments = $template->getArguments();
             foreach ($arguments as $argumentName => $values) {
                 //echo "GOT ARGUMENT NAME $argumentName\n";
                 // propertyNs defaults to DB_PROPERTY_NS unless there
                 // exists a mapping in the templatedb. In that case it will
                 // be set to DB_ONTOLOGY_NS
                 $propertyNs = DB_PROPERTY_NS;
                 $pa = null;
                 if (isset($ta)) {
                     $pas = $ta->getPropertyAnnotations();
                     if (array_key_exists($argumentName, $pas)) {
                         $pa = $pas[$argumentName];
                         $propertyNs = DB_ONTOLOGY_NS;
                     }
                 }
                 //print_r($ta);
                 //echo "PROPERTY NS : $lookupName - $argumentName = $propertyNs\n";
                 // Fake a property mapping if there was none in the db
                 // This maps argumentName back to itself
                 if (!isset($pa)) {
                     // If there was no mapping we might ignore it
                     // depending on an option (we can prevent this extractor
                     // from generating triples with properties in the
                     // dbp:property namespace).
                     // We allow such triples on subResources though.
                     if ($this->allowUnmappedProperties != true && $breadCrumb->getDepth() == 0) {
                         continue;
                     }
                     // If there was no mapping, also rename numeric
                     // argument names (e.g. 1 becomes property1)
                     // this is just cosmetic for the result
                     if (is_numeric($argumentName)) {
                         $argumentName = "property{$argumentName}";
                     }
                     $pa = new PropertyAnnotation($argumentName);
                     $pa->addMapping(new PropertyMapping($argumentName));
                 }
                 // NOTE(review): $argumentName is reassigned inside this loop
                 // (renamed value, trim, and camel-casing/encoding in the
                 // default branch), so a later mapping without a renamed value
                 // sees the name left behind by the previous mapping.
                 foreach ($pa->getMappings() as $pm) {
                     $parseHint = $pm->getParseHint();
                     //echo "Mapping $argumentName : {$pm->getRenamedValue()}\n\n";
                     // if the renamed value is not set, use the original
                     // name
                     // otherwise use the mapped value
                     if (!isEmptyString($pm->getRenamedValue())) {
                         $argumentName = $pm->getRenamedValue();
                     }
                     $argumentName = trim($argumentName);
                     //echo "Mapping $argumentName : {$pm->getRenamedValue()}\n\n";
                     // Skip empty properties
                     // FIXME does that even happen?
                     if (strlen($argumentName) < 1) {
                         continue;
                     }
                     //echo "TN = $templateName, AN = $argumentName\n";
                     // Bread crumb for this argument: a clone of the current
                     // one with a node for (template, index, argument) pushed.
                     $childBreadcrumb = $breadCrumb->createClone();
                     $childBreadcrumb->push(new BreadcrumbNode($templateName, $templateIndex, $argumentName));
                     //$templateChildName = $this->breadcrumbToSubject($childBreadcrumb);
                     $templateChildName = $this->breadCrumbTransformer->transform($childBreadcrumb);
                     // If there is no parse hint we might be able to derive it
                     if (!isset($parseHint)) {
                         $parseHint = $this->deriveParseHintFromName($argumentName);
                     }
                     // Attempt to obtain a triple generator
                     $tripleGenerator = $this->getTripleGenerator($parseHint);
                     // If we DON'T have a triple generator
                     // we fall through to default handling
                     // Per-mapping result, same three-slot layout as $result
                     $localResult = array(array(), array(), array());
                     if (isset($tripleGenerator)) {
                         // NOTE: the loop variable $value overwrites the
                         // method parameter of the same name; the parameter
                         // is only read before the loops (in match()).
                         foreach ($values as $valueIndex => $value) {
                             //echo "GOT VALUE $value\n";
                             $value = trim($value);
                             // Skip empty values
                             if ($value == "") {
                                 continue;
                             }
                             //echo "PROCESSING $templateChildName - $argumentName $value\n";
                             // $tmp is reused here for the generated triples
                             $tmp = $tripleGenerator->generate($templateChildName, $argumentName, $value);
                             $localResult[0] = array_merge($localResult[0], $tmp);
                             //echo "LOCALRESULT\n";
                             //print_r($localResult[0]);
                             //print_r($triples);
                             //echo "\nSigh\n";
                             //if(isset($triples))
                             //    $result = array_merge($result, $triples);
                         }
                         // append the generated triples
                         //continue;
                     } else {
                         // No triple generator - default handling
                         $argumentName = propertyToCamelCase($argumentName);
                         $argumentName = encodeLocalName($argumentName);
                         // This continue moves on to the next mapping
                         if (in_array($argumentName, $GLOBALS['W2RCFG']['ignoreProperties'])) {
                             continue;
                         }
                         // turn the argument name into a property name
                         $propertyName = $propertyNs . $argumentName;
                         foreach ($values as $valueIndex => $value) {
                             $value = trim($value);
                             // Skip empty values
                             if ($value == "") {
                                 continue;
                             }
                             // if the property is "date" and the object a timespan
                             // we prepare it with the following special case:
                             // strip square brackets and normalise the en-dash entity
                             if ($argumentName == "date") {
                                 $value = str_replace("[", "", $value);
                                 $value = str_replace("]", "", $value);
                                 $value = str_replace("&ndash;", "-", $value);
                             }
                             // Parse out sub templates by recursing with the
                             // child bread crumb, then merge all three result
                             // slots into the local result
                             $subResources = $this->myGenerate($childBreadcrumb, $value);
                             for ($i = 0; $i < 3; ++$i) {
                                 $localResult[$i] = array_merge($localResult[$i], $subResources[$i]);
                             }
                             //$result = array_merge($result, $triples);
                             //echo "GOT OBJECT $value\n";
                             // Triples for the value itself
                             $localResult[0] = array_merge($localResult[0], parseAttributeValueWrapper($value, $templateChildName, $propertyName, $this->language));
                             //$result = array_merge($result, $triples);
                         }
                     }
                     // For each triple add the ExtractedFromTemplate-Annotation
                     // (only slot 0 is annotated; meta triples in slot 1 are not)
                     foreach ($localResult[0] as $triple) {
                         $triple->addExtractedFromTemplateAnnotation($templateUri);
                     }
                     // Add on delete cascade annotation (only when the current
                     // bread crumb depth is greater than 1)
                     if ($breadCrumb->getDepth() > 1) {
                         foreach ($localResult[0] as $triple) {
                             $triple->addOnDeleteCascadeAnnotation($rootSubjectUri);
                         }
                     }
                     // merge the results
                     //for($i = 0; $i < 3; ++$i)
                     //    $result[$i] = array_merge($result[$i], $localResult[$i]);
                     //}
                     for ($i = 0; $i < 3; ++$i) {
                         $result[$i] = array_merge($result[$i], $localResult[$i]);
                     }
                 }
             }
             /*
             Open question (disabled code): how to connect a sub-subject to the root subject?
                             if($breadCrumb->getDepth() == 0)
                continue;
             
                             // Create the parent-child connection
                             $parentChildTriple = new RDFtriple(
                $parentResource,
                RDFtriple::URI(DB_PROPERTY_NS . encodeLocalName($parentPropertyName), false),
                RDFtriple::page($templateChildName));
             
                             //$result[1][] = $parentChildTriple;
             */
         }
     }
     // Type triples: one rdf:type per related class collected above; if there
     // are none and we are at depth 0, fall back to rdf:type owl:Thing
     if (count($relatedClasses) > 0) {
         foreach ($relatedClasses as $relatedClass => $dummy) {
             $result[1][] = new RDFtriple($parentResource, RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(DB_ONTOLOGY_NS . $relatedClass, false));
         }
     } else {
         if ($breadCrumb->getDepth() == 0) {
             $result[1][] = new RDFtriple($parentResource, RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(OWL_THING, false));
         }
     }
     // Add the wiki page uses template triples - but only on depth 0
     if ($breadCrumb->getDepth() == 0) {
         foreach ($result[2] as $name => $dummy) {
             $result[1][] = new RDFTriple($parentResource, self::$wikiPageUsesTemplateUri, RDFTriple::URI(DB_TEMPLATE_NS . $name, false));
         }
     }
     // Trace-log the total and every triple / meta triple
     $n = count($result[0]) + count($result[1]);
     $this->log(TRACE, "Generated a total of {$n} triples at {$breadCrumb}");
     foreach ($result[0] as $item) {
         $this->log(TRACE, $item);
     }
     foreach ($result[1] as $item) {
         $this->log(TRACE, $item);
     }
     return $result;
 }
Exemplo n.º 4
0
 if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) {
     continue;
 }
 if ($o !== '' & $o !== NULL) {
     $pred = $p;
     // if(!$GLOBALS['templateStatistics'] && $GLOBALS['propertyStat'][$p]['count']<10)
     //continue;
     // predicate
     // Write properties CamelCase, no underscores, no hyphens. If first char is digit, add _ at the beginning
     $p = propertyToCamelCase($p);
     // Add prefixProperties if set true in config.inc.php
     if ($GLOBALS['prefixPropertiesWithTemplateName']) {
         $p = propertyToCamelCase($templateName) . '_' . $p;
     } else {
         if (!$equal) {
             $p = propertyToCamelCase($templateName . "_" . $p);
         }
     }
     // object
     $o = str_replace('***', '|', $o);
     // Remove HTML Markup for whitespaces
     $o = str_replace('&nbsp;', ' ', $o);
     //remove <ref> Content</ref>
     //$o = preg_replace('/(<ref>.+?<\/ref>)/s','',$o);
     // Parse Subtemplates (only parse Subtemplates with values!)
     if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) {
         foreach ($subTemplates as $subTemplate) {
             // Replace #### back to |, in order to parse subtemplate properly
             $tpl = str_replace("####", "|", $subTemplate[2]);
             // If subtemplate contains values, the subject is only the first word
             if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {