示例#1
0
 /**
  * Extracts semantic annotations from a page's wiki source.
  *
  * Two kinds of annotation are recognised after all balanced {{...}} template
  * calls have been stripped from the source:
  *  - [[property::Target]]  -> triple whose object is a page
  *  - [[property:=value]]   -> triple whose object is a literal
  *
  * @param string $pageID     Identifier of the page; encoded with encodeLocalName() before use as subject
  * @param string $pageTitle  Not used by this method
  * @param string $pageSource Wiki markup of the page
  * @return ExtractionResult  Result object holding every triple that was added
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     // The result is created with the raw page ID; triples use the encoded one.
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     // Remove templates as these are already extracted by the Infobox Extractor.
     // The recursive pattern matches balanced brace groups, including nested ones.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Cut the outer brackets {{ }} and remove the re-wrapped template text
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }
     // Extract internal semantic links.
     // The letter range is A-Z: the former A-z also admitted the ASCII
     // characters between 'Z' and 'a' ([ \ ] ^ `) into property names.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }
     // Extract literals
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // Parsed value layout: object, object_is, datatype(, language)
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         // Skip values for which parsing produced nothing
         if ($lexicalForm == null) {
             continue;
         }
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));
         // NOTE(review): the literal language is hard-coded to 'en' although
         // $this->language is available - confirm whether that is intended.
         $result->addTriple(RDFtriple::page($pageID), RDFTriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }
     return $result;
 }
示例#2
0
/**
 * Legacy code; should be removed in the future.
 *
 * Writes one triple per entry of a semicolon-separated object list.
 *
 * According to the original notes this is only used when the value of a
 * predicate consists of several blank nodes; one output line is generated
 * for each of them.
 *
 * @param string $s Subject (current wiki page or the blank node being processed)
 * @param string $p Predicate whose object is the list
 * @param string $o List of the form "_:a1;_:a2;"
 */
function printList($s, $p, $o)
{
    foreach (explode(';', $o) as $entry) {
        // Skip empty fragments and single characters (checked before trimming)
        if (strlen($entry) <= 1) {
            continue;
        }
        // Only the parsed object and its kind are needed here
        list($value, $valueKind) = parseAttributeValue(trim($entry), $s, $p);
        if (!$value) {
            continue;
        }
        $singleLine = str_replace("\n", '', $value);
        writeTripel($s, $p, trim($singleLine), 'main', $valueKind);
    }
}
示例#3
0
/**
 * Parses the text of one template call and writes its attributes as triples.
 *
 * The template text is split on '|'; the first part is the template name, the
 * remaining parts are "property=value" pairs (or positional values when the
 * template contains no '=' at all). Nested {{...}} templates found inside a
 * value are parsed recursively with a subject derived from the parent subject
 * and the property name.
 *
 * @param string      $subject  Subject the extracted triples are attached to
 * @param string      $template Template text without the outer braces
 * @param string|null $language Language handed to parseAttributeValue(); also used for nested templates
 * @return bool true if at least one attribute was extracted, false if the
 *              template is ignored, has too few attributes or yielded nothing
 */
function parseTemplate($subject, $template, $language = NULL)
{
    // If template/subTemplate is listed as ignored, return false.
    // $tplName is never assigned here; presumably isIgnored() receives it by
    // reference - TODO confirm against its definition.
    if (isIgnored($template, $tplName)) {
        return false;
    }
    // Find subtemplates and remove those which are listed as ignored
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $template, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the outer brackets {{ }}
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        if (isIgnored($subTemplate, $tplName)) {
            $template = str_replace('{{' . $subTemplate . '}}', '', $template);
        }
    }
    // Mask "|" inside subtemplates to avoid splitting them like triples
    // (the mask appears to be "####", judging by the reverse replacement below)
    $template = preg_replace_callback("/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/", 'replaceBarInSubtemplate', $template);
    // Without any '=' the template is treated as a list of positional values
    $equal = preg_match('~=~', $template);
    // Group=[[Group-3-Element|3]] is replaced by Group=[[Group-3-Element***3]];
    // repeated until no link with a bar is left
    do {
        $template = preg_replace('/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/', '[[\\1***\\2]]', $template, -1, $count);
    } while ($count);
    $triples = explode('|', $template);
    if (count($triples) <= $GLOBALS['W2RCFG']['minAttributeCount']) {
        return false;
    }
    $templateName = strtolower(trim(array_shift($triples)));
    // URIs already handed out to subtemplates. If the same URI is in use already, a number is added to it
    $knownSubTemplateURI = array();
    // subject
    $s = $subject;
    // Counter for positional properties
    $z = 0;
    // Becomes true as soon as one attribute has been processed completely
    $extracted = false;
    foreach ($triples as $triple) {
        if ($equal) {
            $split = explode('=', $triple, 2);
            if (count($split) < 2) {
                continue;
            }
            list($p, $o) = $split;
            $p = trim($p);
        } else {
            $p = "property" . ++$z;
            $o = $triple;
        }
        $o = trim($o);
        // If the property is a date and the object a timespan, strip link brackets and normalise the dash
        if ($p === "date") {
            $o = str_replace(array("[", "]"), "", $o);
            $o = str_replace("&ndash;", "-", $o);
        }
        // Do not allow empty properties
        if ($p === '') {
            continue;
        }
        if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) {
            continue;
        }
        // Logical AND (was a bitwise '&' on two booleans)
        if ($o !== '' && $o !== NULL) {
            // predicate
            // Write properties CamelCase, no underscores, no hyphens. If first char is digit, add _ at the beginning
            $p = propertyToCamelCase($p);
            // Add prefixProperties if set true in config.inc.php
            if ($GLOBALS['prefixPropertiesWithTemplateName']) {
                $p = propertyToCamelCase($templateName) . '_' . $p;
            } elseif (!$equal) {
                $p = propertyToCamelCase($templateName . "_" . $p);
            }
            // object: restore the bars masked inside wiki links
            $o = str_replace('***', '|', $o);
            // Remove HTML markup for whitespace
            $o = str_replace('&nbsp;', ' ', $o);
            // Parse subtemplates (only parse subtemplates with values!)
            if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) {
                foreach ($subTemplates as $subTemplate) {
                    // Replace #### back to |, in order to parse the subtemplate properly
                    $tpl = str_replace("####", "|", $subTemplate[2]);
                    // If the subtemplate contains values, the subject is only the first word
                    if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {
                        $subTemplateSubject = $subject . '/' . $p . '/' . $match[1];
                    } else {
                        $subTemplateSubject = $subject . '/' . $p . '/' . $tpl;
                    }
                    // Look up the URI among the known URIs; if found, add a counter to it.
                    // e.g. http://dbpedia.org/United_Kingdom/footnote/cite_web
                    // ==>  http://dbpedia.org/United_Kingdom/footnote/cite_web1 ...
                    if (!isset($knownSubTemplateURI[$subTemplateSubject])) {
                        $knownSubTemplateURI[$subTemplateSubject] = 0;
                    } else {
                        $knownSubTemplateURI[$subTemplateSubject]++;
                        $subTemplateSubject .= $knownSubTemplateURI[$subTemplateSubject];
                    }
                    // If the subtemplate contained real values, write the corresponding triple.
                    // The language is forwarded so nested values are parsed like top-level ones.
                    if (parseTemplate($subTemplateSubject, $tpl, $language)) {
                        writeTripel($s, $GLOBALS['W2RCFG']['propertyBase'] . $p, $subTemplateSubject, 'main', 'r', null, null);
                    }
                }
            }
            // Remove subtemplates from the value
            $o = str_replace("####", "|", $o);
            $o = preg_replace("/\\{{2}[^\\}]+\\}{2}/", "", $o);
            // Sometimes only whitespace remains, then continue with the next triple
            if (preg_match("/^[\\s]*\$/", $o)) {
                continue;
            }
            // Replace the predicate if necessary to make it unambiguous
            $p = replacePredicate($p);
            // Add URI prefix to the property name
            $p = $GLOBALS['W2RCFG']['propertyBase'] . $p;
            if (isBlanknoteList($o)) {
                printList($s, $p, $o);
            } else {
                list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p, $language);
                // Special newline handling: literals keep line breaks, resources lose them
                $br = array('<br>', '<br/>', '<br />');
                if ($o_is == 'l') {
                    $o = str_replace($br, "\n", $o);
                } elseif ($o_is == 'r') {
                    $o = str_replace($br, '', $o);
                }
                if ($o !== NULL) {
                    writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
                }
            }
            $extracted = true;
        }
    }
    if ($extracted) {
        // Record which template the subject uses
        writeTripel($s, $GLOBALS['W2RCFG']['templateProperty'], $GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $templateName);
    }
    return $extracted;
}
示例#4
0
/**
 * A wrapper for parseAttributeValue that returns RDFtriple objects.
 *
 * Results are collected from two places: the global $parseResult (read after
 * the call; presumably filled by parseAttributeValue as a side channel) and
 * the value parseAttributeValue returns. Both are mapped to the same
 * (object, object type, datatype, language) layout before triples are built.
 *
 * @global mixed $parseResult reset to null before and after the call
 * @param string $value             Raw attribute value to parse
 * @param string $templateChildName Used as the subject page of every triple
 * @param string $propertyName      Used as the predicate URI of every triple
 * @param string $language          Fallback language for results without one
 * @return array List of RDFtriple objects (possibly empty)
 */
function parseAttributeValueWrapper($value, $templateChildName, $propertyName, $language)
{
    global $parseResult;

    $parseResult = null;
    $direct = parseAttributeValue($value, $templateChildName, $propertyName, $language);

    // Bring global and direct results into one uniform schema
    $candidates = array();
    if (isset($parseResult)) {
        foreach ($parseResult as $entry) {
            // The first two fields of a global entry are not needed here
            list(, , $obj, $kind, $dtype, $lang) = $entry;
            $candidates[] = array($obj, $kind, $dtype, $lang);
        }
    }
    $parseResult = null;
    if (isset($direct)) {
        list($obj, $kind, $dtype, $lang) = $direct;
        $candidates[] = array($obj, $kind, $dtype, $lang);
    }

    $lineBreaks = array('<br>', '<br/>', '<br />');
    $triples = array();
    foreach ($candidates as $candidate) {
        list($obj, $kind, $dtype, $lang) = $candidate;
        // The language is passed into the parse function but not handed back,
        // so fall back to the one the caller supplied
        if (!isset($lang)) {
            $lang = $language;
        }
        // Special newline handling: literals keep line breaks, resources lose them
        if ($kind == 'l') {
            $obj = str_replace($lineBreaks, "\n", $obj);
        } elseif ($kind == 'r') {
            $obj = str_replace($lineBreaks, '', $obj);
        }
        // Build the object node; anything that is neither resource nor literal is skipped
        if ($kind == "r") {
            $node = RDFtriple::URI($obj);
        } elseif ($kind == "l") {
            $node = RDFtriple::Literal($obj, $dtype, $lang);
        } else {
            Logger::warn("Shouldn't happen - found a blank node where none expected - objectType = {$kind}");
            continue;
        }
        $triples[] = new RDFtriple(RDFtriple::page($templateChildName), RDFtriple::URI($propertyName), $node);
    }
    return $triples;
}
/**
 * Parses numbers followed by a reference and writes the number as a triple.
 *
 * Two forms are recognised, both anchored at the start of the value:
 *  - a year in parentheses,  e.g. numEmployees = 12,380 (2006)
 *  - an external link,       e.g. revenue = 23 billion $ [http://moneyfacts.com]
 *
 * The number may carry currency signs and an optional magnitude word
 * (billion, million, trillion, quadrillion). Only the number part is handed
 * to parseAttributeValue(); the reference is discarded.
 *
 * @param string $o Raw attribute value
 * @param string $s Subject of the triple
 * @param string $p Predicate of the triple
 * @return bool true if the value had one of the two forms (it has then been
 *              handled here), false otherwise
 */
function catchNumberWithReference($o, $s, $p)
{
    // Number part. All magnitude words live in ONE optional group after the
    // digits; previously "trillion"/"quadrillion" were separate, unanchored
    // and optional alternatives, so values without any number matched too and
    // "12 trillion (2006)" captured only "trillion".
    $number = '[0-9,\.$£€¥ ]+(?:[bBmM]illion|[tT]rillion|[qQ]uadrillion)?';
    // Year reference such as "(2006)"
    $yearReference = '\s*\([0-9]{4}\)';
    // External link reference such as "[http://example.org]" (http or https)
    $linkReference = '\s*\[https?:\/\/[^\]]+\]';

    $pattern = '/^(' . $number . ')(?:' . $yearReference . '|' . $linkReference . ')/';
    if (!preg_match($pattern, $o, $match)) {
        return false;
    }

    $o = trim($match[1]);
    list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p);
    if ($o !== NULL) {
        writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
    }
    return true;
}