/**
 * Extracts semantic annotations of the form [[property::target]] (links) and
 * [[property:=value]] (literals) from a page and returns them as triples.
 *
 * All brace-delimited template blocks are stripped from the source first, because
 * templates are already handled by the Infobox Extractor.
 *
 * @param string $pageID     page identifier; used un-encoded for the result object and
 *                           encoded via encodeLocalName() as the subject of every triple
 * @param string $pageTitle  not used by this method
 * @param string $pageSource wiki source of the page
 * @return ExtractionResult  result object holding one triple per annotation found
 */
public function extractPage($pageID, $pageTitle, $pageSource)
{
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);
    $pageID = encodeLocalName($pageID);

    // Remove every (possibly nested) template block from the page source.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the surrounding "{{" / "}}" so the block can be re-wrapped uniformly below.
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
    }

    // Internal semantic links: [[property::target]].
    // The property class is a-z, A-Z, 0-9, "-", space and "_". The former "A-z" range
    // additionally matched "[", "\", "]", "^" and "`", which was a typo.
    preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate(encodeLocalName($match[2])),
            RDFtriple::page($match[4])
        );
    }

    // Literals: [[property:=value]].
    preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // parseAttributeValue() yields: object, object_is, datatype(, language)
        $parsed = parseAttributeValue($match[4], $pageID, $match[2]);
        $lexicalForm = $parsed[0];
        $datatype = $parsed[2];
        $predicate = propertyToCamelCase(encodeLocalName($match[2]));

        // Skip values that parsed to nothing.
        if ($lexicalForm == null) {
            continue;
        }

        // NOTE(review): the literal language is hard-coded to 'en' rather than
        // taken from $this->language — confirm this is intended.
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFTriple::predicate($predicate),
            RDFtriple::literal($lexicalForm, $datatype, 'en')
        );
    }

    return $result;
}
/**
 * Legacy code; should be removed in the future.
 *
 * Used when the value of a predicate is a list of several blank nodes. The list is
 * split on ";" and every entry longer than one character is parsed with
 * parseAttributeValue(); one triple is written per entry that yields a truthy object.
 *
 * @param string $s subject (current wiki page or the blank node currently processed)
 * @param string $p predicate whose object is a blank-node list
 * @param string $o blank-node list of the form _:a1;_:a2;
 * @return void
 */
function printList($s, $p, $o)
{
    foreach (explode(';', $o) as $entry) {
        // Length is checked on the raw, untrimmed entry.
        if (strlen($entry) <= 1) {
            continue;
        }

        // Only the parsed object and its kind are needed here.
        list($value, $valueKind) = parseAttributeValue(trim($entry), $s, $p);
        if (!$value) {
            continue;
        }

        $singleLine = str_replace("\n", '', $value);
        writeTripel($s, $p, trim($singleLine), 'main', $valueKind);
    }
}
/**
 * Parses one template and writes a triple for every usable attribute, recursing into
 * sub-templates that occur inside attribute values.
 *
 * The template text is split on "|": the first segment is the template name, the
 * remaining segments are attributes ("name=value" when the template contains "=",
 * otherwise positional values that are named property1, property2, ...).
 *
 * @param string      $subject  subject used for the triples of this template
 * @param string      $template template source (the recursive call passes the text
 *                              between "{{" and "}}")
 * @param string|null $language forwarded to parseAttributeValue()
 * @return bool true when at least one attribute was processed, false when the template
 *              is ignored, has too few attributes, or yielded nothing
 */
function parseTemplate($subject, $template, $language = NULL)
{
    // $tplName is never assigned in this function; isIgnored() presumably receives it
    // by reference — TODO confirm against its definition.
    if (isIgnored($template, $tplName)) {
        return false;
    }

    // Find nested templates and remove those that are listed as ignored.
    preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $template, $subTemplates);
    foreach ($subTemplates[0] as $subTemplate) {
        // Cut the surrounding "{{" / "}}".
        $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
        if (isIgnored($subTemplate, $tplName)) {
            $template = str_replace('{{' . $subTemplate . '}}', '', $template);
        }
    }

    // Mask "|" inside the remaining sub-templates so explode('|') below does not split
    // them (presumably with "####", which is converted back further down).
    $template = preg_replace_callback("/(\\{{2})([^\\}\\|]+)(\\|)([^\\}]+)(\\}{2})/", 'replaceBarInSubtemplate', $template);

    // Does the template use named attributes at all?
    $equal = preg_match('~=~', $template);

    // [[Target|Label]] becomes [[Target***Label]]; repeat until nothing is rewritten.
    do {
        $template = preg_replace('/\\[\\[([^\\]]+)\\|([^\\]]*)\\]\\]/', '[[\\1***\\2]]', $template, -1, $count);
    } while ($count);

    $triples = explode('|', $template);
    if (count($triples) <= $GLOBALS['W2RCFG']['minAttributeCount']) {
        return false;
    }
    $templateName = strtolower(trim(array_shift($triples)));

    // Sub-template URIs already handed out; a repeated URI gets a counter appended.
    $knownSubTemplateURI = array();
    $s = $subject;
    $z = 0;
    $extracted = false;

    foreach ($triples as $triple) {
        if ($equal) {
            $split = explode('=', $triple, 2);
            if (count($split) < 2) {
                continue;
            }
            list($p, $o) = $split;
            $p = trim($p);
        } else {
            // Positional attribute: synthesize a property name.
            $p = "property" . ++$z;
            $o = $triple;
        }
        $o = trim($o);

        // Special case: strip link brackets and normalize the dash of a time span.
        if ($p == "date") {
            $o = str_replace("[", "", str_replace("]", "", $o));
            $o = str_replace("–", "-", $o);
        }

        // Do not allow empty properties.
        if (strlen($p) < 1) {
            continue;
        }
        if (in_array($p, $GLOBALS['W2RCFG']['ignoreProperties'])) {
            continue;
        }
        // Logical (not bitwise) test: skip attributes without a value.
        if ($o === '' || $o === null) {
            continue;
        }

        // Predicate: CamelCase, optionally prefixed with the template name.
        $p = propertyToCamelCase($p);
        if ($GLOBALS['prefixPropertiesWithTemplateName']) {
            $p = propertyToCamelCase($templateName) . '_' . $p;
        } elseif (!$equal) {
            $p = propertyToCamelCase($templateName . "_" . $p);
        }

        // Object: restore the "|" that was masked inside wiki links.
        $o = str_replace('***', '|', $o);
        // Replace non-breaking spaces with plain spaces. Both the HTML entity and the
        // raw UTF-8 character are spelled out so the replacement is visible in source.
        $o = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $o);

        // Parse sub-templates contained in the value.
        if (preg_match_all("/(\\{{2})([^\\}]+)(\\}{2})/", $o, $subTemplates, PREG_SET_ORDER)) {
            foreach ($subTemplates as $subTemplate) {
                // Replace #### back to | in order to parse the sub-template properly.
                $tpl = str_replace("####", "|", $subTemplate[2]);

                // If the sub-template has values, only its first segment names the subject.
                if (preg_match("/(^[^\\|]+)(\\|)/", $tpl, $match)) {
                    $subTemplateSubject = $subject . '/' . $p . '/' . $match[1];
                } else {
                    $subTemplateSubject = $subject . '/' . $p . '/' . $tpl;
                }

                // e.g. http://dbpedia.org/United_Kingdom/footnote/cite_web
                //  ==> http://dbpedia.org/United_Kingdom/footnote/cite_web1 ...
                if (!isset($knownSubTemplateURI[$subTemplateSubject])) {
                    $knownSubTemplateURI[$subTemplateSubject] = 0;
                } else {
                    $knownSubTemplateURI[$subTemplateSubject]++;
                    $subTemplateSubject .= $knownSubTemplateURI[$subTemplateSubject];
                }

                // Link to the sub-template only if it contained real values.
                if (parseTemplate($subTemplateSubject, $tpl)) {
                    writeTripel($s, $GLOBALS['W2RCFG']['propertyBase'] . $p, $subTemplateSubject, 'main', 'r', null, null);
                }
            }
        }

        // Remove sub-templates from the value itself.
        $o = str_replace("####", "|", $o);
        $o = preg_replace("/\\{{2}[^\\}]+\\}{2}/", "", $o);

        // Sometimes only whitespace remains; continue with the next attribute.
        if (preg_match("/^[\\s]*\$/", $o)) {
            continue;
        }

        // Replace the predicate if necessary to make it unambiguous, then add the URI prefix.
        $p = replacePredicate($p);
        $p = $GLOBALS['W2RCFG']['propertyBase'] . $p;

        if (isBlanknoteList($o)) {
            printList($s, $p, $o);
        } else {
            list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p, $language);

            // Check for "no value" before touching the object: str_replace() on null
            // would turn it into '' and defeat this guard.
            if ($o !== null) {
                // Special newline handling: <br> is a newline in literals, noise in resources.
                $br = array('<br>', '<br/>', '<br />');
                if ($o_is == 'l') {
                    $o = str_replace($br, "\n", $o);
                } elseif ($o_is == 'r') {
                    $o = str_replace($br, '', $o);
                }
                writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
            }
        }

        $extracted = true;
    }

    if ($extracted) {
        // Record which template the subject uses.
        writeTripel($s, $GLOBALS['W2RCFG']['templateProperty'], $GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $templateName);
    }

    return $extracted;
}
/**
 * Wrapper around parseAttributeValue() that returns RDFtriple objects.
 *
 * parseAttributeValue() hands back one result directly and may additionally fill the
 * global $parseResult with further entries. Both sources are normalized to
 * (object, objectType, dataType, language) tuples and converted to triples whose
 * subject is the page $templateChildName and whose predicate is the URI $propertyName.
 *
 * @global array|null $parseResult extra results set as a side effect of parsing;
 *                                 reset to null before and after the call
 * @param mixed $value             raw attribute value, passed through to parseAttributeValue()
 * @param mixed $templateChildName subject, passed to RDFtriple::page()
 * @param mixed $propertyName      predicate, passed to RDFtriple::URI()
 * @param mixed $language          fallback language for results that carry none
 * @return array list of RDFtriple objects (possibly empty)
 */
function parseAttributeValueWrapper($value, $templateChildName, $propertyName, $language)
{
    $result = array();

    global $parseResult;
    $parseResult = null;
    $localResult = parseAttributeValue($value, $templateChildName, $propertyName, $language);

    // Remap global and local results into one uniform list of 4-tuples.
    $tuples = array();
    if (isset($parseResult)) {
        foreach ($parseResult as $entry) {
            // The first two fields of a global entry are not needed here.
            list(, , $o, $ot, $dt, $ol) = $entry;
            $tuples[] = array($o, $ot, $dt, $ol);
        }
    }
    $parseResult = null;

    if (isset($localResult)) {
        list($o, $ot, $dt, $ol) = $localResult;
        $tuples[] = array($o, $ot, $dt, $ol);
    }

    $lineBreaks = array('<br>', '<br/>', '<br />');

    foreach ($tuples as $tuple) {
        list($object, $objectType, $dataType, $objectLanguage) = $tuple;

        // The language is passed into the parse function but not handed back,
        // so fall back to the caller's language.
        if (!isset($objectLanguage)) {
            $objectLanguage = $language;
        }

        $isLiteral = ($objectType == 'l');
        $isResource = ($objectType == 'r');

        // Special newline handling: <br> becomes a newline in literals and is dropped
        // from resources.
        if ($isLiteral) {
            $object = str_replace($lineBreaks, "\n", $object);
        } elseif ($isResource) {
            $object = str_replace($lineBreaks, '', $object);
        }

        if ($isResource) {
            $object = RDFtriple::URI($object);
        } elseif ($isLiteral) {
            $object = RDFtriple::Literal($object, $dataType, $objectLanguage);
        } else {
            Logger::warn("Shouldn't happen - found a blank node where none expected - objectType = {$objectType}");
            continue;
        }

        $result[] = new RDFtriple(
            RDFtriple::page($templateChildName),
            RDFtriple::URI($propertyName),
            $object
        );
    }

    return $result;
}
/**
 * Parses numbers that are followed by a reference and writes the number as a triple.
 *
 * Two kinds of trailing reference are recognized:
 *  - a four-digit year in parentheses, e.g. numEmployees = 12,380 (2006)
 *  - an external link, e.g. revenue = 23 billion $ [http://moneyfacts.com]
 *
 * The value must start with the amount: digits, separators and currency signs,
 * optionally followed by billion/million/trillion/quadrillion and a trailing currency
 * sign. Only that amount is handed to parseAttributeValue(); the reference is dropped.
 *
 * @param string $o raw attribute value
 * @param string $s subject of the triple
 * @param string $p predicate of the triple
 * @return bool true when the value had the "number + reference" shape (a triple is
 *              written unless parsing yields null), false otherwise
 */
function catchNumberWithReference($o, $s, $p)
{
    // One anchored amount group. The magnitude words are alternatives of a single
    // optional suffix, so none of them can match on its own or match the empty string.
    $amount = '[0-9,.\$£€¥ ]+(?:(?:[bB]illion|[mM]illion|[tT]rillion|[qQ]uadrillion)[\$£€¥ ]*)?';
    $yearReference = '\s*\([0-9]{4}\)';
    $linkReference = '\s*\[https?:\/\/[^\]]+\]';

    $matched = preg_match('/^(' . $amount . ')' . $yearReference . '/', $o, $match)
        || preg_match('/^(' . $amount . ')' . $linkReference . '/', $o, $match);
    if (!$matched) {
        return false;
    }

    list($o, $o_is, $dtype, $lang) = parseAttributeValue(trim($match[1]), $s, $p);
    if ($o !== null) {
        writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
    }

    return true;
}