/**
 * Writes one sub-category triple for every (category page, parent category)
 * row found in the `page` / `categorylinks` tables (page_namespace = 14).
 *
 * Filtering, driven by $GLOBALS['W2RCFG']:
 *  - if both 'categories' and 'categoriesPattern' are empty, every row is written;
 *  - otherwise a row is written when the decoded page title matches one of the
 *    'categories' entries as a whole (case-insensitive; the entries are used as
 *    regex fragments, exactly as before, and are NOT quoted), or matches at
 *    least one of the 'categoriesPattern' shell globs.
 *
 * Relies on an already opened legacy mysql connection and on the project
 * helpers decodeLocalName() and writeTripel().
 *
 * @return void
 */
function extractCategoryRelationships()
{
    $config = $GLOBALS['W2RCFG'];

    $categories = empty($config['categories']) ? array() : (array) $config['categories'];
    $patterns = empty($config['categoriesPattern']) ? array() : (array) $config['categoriesPattern'];
    $unfiltered = !$categories && !$patterns;

    // The alternatives must be grouped: without the group, "^a|b$" means
    // "starts with a OR ends with b" instead of "is exactly a or b".
    $categoryRegex = $categories ? '~^(?:' . implode('|', $categories) . ')$~i' : null;

    $res = mysql_query('SELECT page_title,cl_to FROM page INNER JOIN categorylinks ON(page_id=cl_from) WHERE page_namespace=14');
    if (!$res) {
        // Query failed: nothing to iterate over.
        return;
    }

    $categoryBase = $config['wikipediaBase'] . $GLOBALS['categoryLabel'] . ':';
    $property = $config['subCategoryProperty'];

    while ($row = mysql_fetch_array($res)) {
        if (!$unfiltered) {
            $name = decodeLocalName($row[0]);
            $selected = $categoryRegex !== null && preg_match($categoryRegex, $name);

            if (!$selected) {
                // fnmatch() has no alternation operator, so every glob is
                // tried on its own instead of being joined with '|'.
                foreach ($patterns as $pattern) {
                    if (fnmatch($pattern, $name)) {
                        $selected = true;
                        break;
                    }
                }
            }

            if (!$selected) {
                continue;
            }
        }

        writeTripel($categoryBase . $row[0], $property, $categoryBase . $row[1]);
    }
}
/**
 * Classifies a raw attribute value and determines how it should be emitted.
 *
 * The value is run through a fixed sequence of matchers; the first one that
 * accepts it decides the object kind ($object_is: 'l' by default, 'b' or 'r'
 * for some branches) and, where applicable, the datatype URI.
 * Some branches write their triples themselves and return null so that the
 * caller emits nothing further.
 *
 * @param mixed       $object     Raw attribute value.
 * @param string      $subject    Subject URI of the triple being built.
 * @param string      $predicate  Predicate URI of the triple being built.
 * @param mixed       $extractor  Only forwarded to catchLinkList() and
 *                                catchNumberWithReference().
 * @param string|null $language   Language code; used for the image description
 *                                URL and passed through in the result.
 * @return array|null array($object, $object_is, $dtype, $language), or null when
 *                    the value was handled internally or is empty after trimming.
 */
public function parseValue($object, $subject, $predicate, &$extractor, $language = NULL)
{
    $dtype = null;
    $object_is = 'l';

    if (isBlanknote($object)) {
        $object_is = 'b';
        $object = str_replace(";", "", $object);
    } elseif (isInt($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#integer';
    } elseif (isIntwithComma($object)) {
        $object = str_replace(",", "", $object);
        $dtype = 'http://www.w3.org/2001/XMLSchema#integer';
    } elseif (isFloat($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#decimal';
    } elseif (catchPictureURI($object, $subject)) {
        // Picture: additionally link the image to its Wikipedia description page.
        $object_is = 'r';
        $imageFile = substr($object, strrpos($object, '/') + 1);
        $descriptionPage = 'http://' . $language . '.wikipedia.org/wiki/Image:' . $imageFile;
        writeTripel($object, 'http://purl.org/dc/terms/rights', $descriptionPage, 'main', $object_is);
    } elseif (catchMonthYear($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#gYearMonth';
    } elseif (catchDate($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#date';
    } elseif (catchYear($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#gYear';
    } elseif (catchRank($object)) {
        $dtype = $GLOBALS['W2RCFG']['w2ruri'] . 'Rank';
    } elseif (catchLargeNumber($object)) {
        $dtype = 'http://www.w3.org/2001/XMLSchema#integer';
    } elseif ($dtype = catchLargeMoney($object)) {
        // The matcher's return value is the datatype.
    } elseif ($dtype = catchMoneyWoCent($object)) {
        // The matcher's return value is the datatype.
    } elseif ($dtype = catchMoney($object)) {
        // The matcher's return value is the datatype.
    } elseif (catchPercent($object)) {
        $dtype = $GLOBALS['W2RCFG']['w2ruri'] . 'Percent';
    } elseif ($dtype = catchUnited($object)) {
        // The matcher's return value is the datatype.
    } elseif (catchLink($object)) {
        $object_is = 'r';
    } elseif (catchLinkList($object, $subject, $predicate, $dtype, $extractor)) {
        return null;
    } elseif ($externalLinks = catchExternalLink($object)) {
        // One resource triple per external link; the first space-separated
        // token of each entry is used as the object.
        foreach ($externalLinks[1] as $entry) {
            if (strlen($entry) <= 1) {
                continue;
            }
            $tokens = explode(" ", $entry);
            $target = $tokens[0];
            $object_is = 'r';
            writeTripel($subject, $predicate, $target, 'main', $object_is);
        }
        return null;
    } elseif (catchNumberWithReference($object, $subject, $predicate, $extractor)) {
        return null;
    } else {
        // NOTE(review): the return value is unused; presumably removeWikiCode()
        // cleans $object by reference — confirm.
        removeWikiCode($object);
    }

    // Explicit type triples (printexplicitTyping) are currently not emitted here.

    if (strlen(trim($object)) === 0) {
        return null;
    }

    return array($object, $object_is, $dtype, $language);
}
/**
 * Registers a name once per file and, on first registration, writes its
 * explicit type triple (or registers the linking property instead).
 *
 * Names are remembered in static arrays keyed by $filename, with a suffix
 * derived from $name_is (':Cat', ':Temp' or ':Pred').
 *
 * @param string $name       Category, template or predicate name/URI.
 * @param string $filename   Key under which already seen names are tracked.
 * @param string $name_is    'c', 't' or 'p' (suffixes ':Cat', ':Temp', ':Pred').
 * @param string $object_is  Object kind to remember for the name; 'n' stores nothing.
 * @return string|null Null on first registration or for an unknown $name_is;
 *                     on later calls the remembered object kind, or null when
 *                     none was stored.
 */
function printexplicitTyping($name, $filename, $name_is, $object_is = 'n')
{
    static $namearray = array();
    static $predicatetypearray = array();

    $suffixes = array('c' => ':Cat', 't' => ':Temp', 'p' => ':Pred');
    if (!is_string($name_is) || !isset($suffixes[$name_is])) {
        // Unknown kind: previously this ran on with an undefined $save.
        return null;
    }
    $save = $name . $suffixes[$name_is];

    // Make sure the per-file list exists before it is searched.
    if (!isset($namearray[$filename])) {
        $namearray[$filename] = array();
    }

    if (arrayMultiSearch($save, $namearray[$filename])) {
        // Already registered: report the remembered kind, if there is one.
        return isset($predicatetypearray[$save][$filename]['is'])
            ? $predicatetypearray[$save][$filename]['is']
            : null;
    }

    $namearray[$filename][] = $save;
    if ($object_is != 'n') {
        $predicatetypearray[$save][$filename]['is'] = $object_is;
    }

    $config = $GLOBALS['W2RCFG'];
    $rdfTypeProperty = $GLOBALS['rdftypeProperty'];

    // 'type' when the linking property is the rdf:type property itself,
    // otherwise 'main'.
    $filedecisionTemplate = $rdfTypeProperty != $config['templateProperty'] ? 'main' : 'type';
    $filedecisionCategory = $rdfTypeProperty != $config['categoryProperty'] ? 'main' : 'type';

    if ($name_is == 'c') {
        if ($filedecisionCategory == 'type') {
            writeTripel($name, $config['categoryProperty'], $config['classBase'], 'type');
        } else {
            // Type the category property itself as a resource-valued predicate.
            printexplicitTyping($config['categoryProperty'], $filename, 'p', 'r');
        }
    } elseif ($name_is == 't') {
        if ($filedecisionTemplate == 'type') {
            writeTripel(
                $config['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $name,
                $rdfTypeProperty,
                $config['classBase'],
                'type'
            );
        } else {
            // Type the template property itself as a resource-valued predicate.
            printexplicitTyping($config['templateProperty'], $filename, 'p', 'r');
        }
    } else {
        // Predicate: literal-valued ('l') gets the datatype property base,
        // everything else the object property base.
        $propertyClass = $object_is == 'l'
            ? $config['datatypePropertyBase']
            : $config['objectPropertyBase'];
        writeTripel($name, $rdfTypeProperty, $propertyClass, 'type');
    }

    return null;
}
//$GLOBALS['W2RCFG']['allowedtags'] = $GLOBALS['W2RCFG']['allowedtags']."<ref> </ref>"; $tpl = strip_tags($tpl, $GLOBALS['W2RCFG']['allowedtags']); //$GLOBALS['W2RCFG']['allowedtags'] = str_replace("<ref>","",$GLOBALS['W2RCFG']['allowedtags']); //$GLOBALS['W2RCFG']['allowedtags'] = str_replace("</ref>","",$GLOBALS['W2RCFG']['allowedtags']); if ($templateCount > 1 && strlen($tmpTemplateName) > 1) { if (!isset($knownTemplates[$tmpTemplateName])) { $knownTemplates[$tmpTemplateName] = 1; } else { $knownTemplates[$tmpTemplateName]++; } $subject = $GLOBALS['W2RCFG']['wikipediaBase'] . $page . '/' . $tmpTemplateName . $knownTemplates[$tmpTemplateName]; ////////////////////////////////////// // Call function parseTemplate ////////////////////////////////////// if ($extracted = $this->parseTemplate($subject, $tpl, $language)) { writeTripel($GLOBALS['W2RCFG']['wikipediaBase'] . $page, $GLOBALS['W2RCFG']['propertyBase'] . 'relatedInstance', $subject, 'r'); if (isset($tplCount[$tplName])) { $tplCount[$tplName]++; } else { $tplCount[$tplName] = 1; } } } else { $subject = $GLOBALS['W2RCFG']['wikipediaBase'] . $page; ////////////////////////////////////// // Call function parseTemplate ////////////////////////////////////// if ($extracted = $this->parseTemplate($subject, $tpl, $language)) { if (isset($tplCount[$tplName])) { $tplCount[$tplName]++; } else {
printList($s, $p, $o); } else { list($o, $o_is, $dtype, $lang) = $this->parseAttributeValue($o, $s, $p, $language); // special newline handling $br = array('<br>', '<br/>', '<br />'); if ($o_is == 'l') { $o = str_replace($br, "\n", $o); } else { if ($o_is == 'r') { $o = str_replace($br, '', $o); } } if ($o !== NULL) { writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang); } } //if($GLOBALS['templateStatistics'] && $o!=NULL && $equal) { // $GLOBALS['propertyStat'][$pred]['count']++; // $GLOBALS['propertyStat'][$pred]['maxCountPerTemplate']=max($GLOBALS['propertyStat'][$pred]['maxCountPerTemplate'],++$pc[$pred]); // if(!$GLOBALS['propertyStat'][$pred]['inTemplates'] || !in_array($templateName,$GLOBALS['propertyStat'][$pred]['inTemplates'])) // $GLOBALS['propertyStat'][$pred]['inTemplates'][]=$templateName; //} $extracted = true; } } if (isset($extracted) && $extracted) { //writeTripel($s,$GLOBALS['W2RCFG']['templateProperty'],$GLOBALS['W2RCFG']['wikipediaBase'].$GLOBALS['templateLabel'].':'.encodeLocalName($templateName),$GLOBALS['filedecisionTemplate']); writeTripel($s, $GLOBALS['W2RCFG']['templateProperty'], $GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . $templateName); //if ($GLOBALS['addExplicitTypeTriples']) // printexplicitTyping($templateName,$GLOBALS['filename'],'t'); }
/**
 * Parses a number that is followed by a reference which is not part of the
 * value itself:
 *  - a year in parentheses, e.g.  numEmployees = 12,380 (2006)
 *  - an external link,      e.g.  revenue = 23 billion $ [http://moneyfacts.com]
 *
 * The value must START with the number (digits, separators, currency signs,
 * optionally followed by million/billion/trillion/quadrillion and trailing
 * currency signs). The reference is cut off, the remaining number is handed
 * to parseAttributeValue() and the resulting triple is written.
 *
 * NOTE(review): parseAttributeValue() is called as a plain function here while
 * other code in this file calls it as a method, and callers pass a fourth
 * argument ($extractor) that this function ignores — confirm this is intended.
 *
 * @param string $o Raw attribute value.
 * @param string $s Subject URI.
 * @param string $p Predicate URI.
 * @return bool True when the value had the number-plus-reference shape (it is
 *              then considered handled, even if no triple could be written),
 *              false otherwise.
 */
function catchNumberWithReference($o, $s, $p)
{
    // The magnitude words are one optional group attached to the number.
    // Previously "trillion"/"quadrillion" were separate, optional top-level
    // alternatives, so the number group could match the empty string anywhere
    // and every value containing "(YYYY)" or "[http://...]" was swallowed.
    $number = '[0-9,.$£€¥ ]+(?:[bB]illion|[mM]illion|[tT]rillion|[qQ]uadrillion)?[$£€¥ ]*';
    $reference = '(?:\\([0-9]{4}\\)|\\[http:\\/\\/[^\\]]+\\])';

    if (!preg_match('/^(' . $number . ')\\s*' . $reference . '/', $o, $match)) {
        return false;
    }

    $value = trim($match[1]);
    if ($value === '') {
        // Only blanks in front of the reference: not a number.
        return false;
    }

    $parsed = parseAttributeValue($value, $s, $p);
    if (is_array($parsed)) {
        list($parsedValue, $o_is, $dtype, $lang) = array_pad($parsed, 4, null);
        if ($parsedValue !== NULL) {
            writeTripel($s, $p, $parsedValue, 'main', $o_is, $dtype, $lang);
        }
    }

    return true;
}