Example #1
0
 /**
  * Extracts Semantic-MediaWiki style annotations from a page's wiki source.
  *
  * Brace-delimited template calls are stripped from the source first (per the
  * original author they are already handled by the Infobox extractor). Two
  * annotation forms are then turned into triples:
  *  - [[property::Target]]  => subject page, encoded property, target page
  *  - [[property:=value]]   => subject page, camel-cased property, literal
  *
  * @param string $pageID     Page identifier; passed raw to the ExtractionResult
  *                           and encoded with encodeLocalName() for the subject
  * @param string $pageTitle  Not used by this extractor
  * @param string $pageSource Raw wiki markup of the page
  *
  * @return ExtractionResult  Result object holding every triple that was added
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $pageID = encodeLocalName($pageID);
     // Find (possibly nested) brace groups with a recursive pattern and remove
     // each one from the source so annotations inside templates are ignored.
     preg_match_all('~\\{((?>[^{}]+)|(?R))*\\}~x', $pageSource, $subTemplates);
     foreach ($subTemplates[0] as $subTemplate) {
         // Cut the outer {{ }} so the inner text can be re-wrapped for str_replace
         $subTemplate = preg_replace("/(^\\{\\{)|(\\}\\}\$)/", "", $subTemplate);
         $pageSource = str_replace('{{' . $subTemplate . '}}', '', $pageSource);
     }
     // Extract internal semantic links: [[property::Target]].
     // The property class is a-z, A-Z, 0-9, '-', ' ' and '_'. The former "A-z"
     // range was a typo that also let '[', '\', ']', '^' and '`' through.
     preg_match_all('/(\\[\\[)([a-zA-Z0-9\\- _]+)(::)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate(encodeLocalName($match[2])), RDFtriple::page($match[4]));
     }
     // Extract literals: [[property:=value]]
     preg_match_all('/(\\[\\[)([a-zA-Z\\-_ ]+)(:=)([^\\]]+)\\]\\]/', $pageSource, $matches, PREG_SET_ORDER);
     foreach ($matches as $match) {
         // parseAttributeValue() yields: object, object_is, datatype(, language)
         $triple = parseAttributeValue($match[4], $pageID, $match[2]);
         $lexicalForm = $triple[0];
         $datatype = $triple[2];
         $predicate = propertyToCamelCase(encodeLocalName($match[2]));
         // Skip values for which the parser produced nothing
         if ($lexicalForm == null) {
             continue;
         }
         // NOTE(review): the literal language is hard-coded to 'en' rather than
         // taken from $this->language - confirm this is intended.
         $result->addTriple(RDFtriple::page($pageID), RDFTriple::predicate($predicate), RDFtriple::literal($lexicalForm, $datatype, 'en'));
     }
     return $result;
 }
Example #2
0
 /**
  * Emits an owl:sameAs link to the equally named DBpedia resource when the
  * page source contains a {{wikipedia-c}} or {{wikipedia-c-note}} marker.
  *
  * @param string $pageID     Page identifier; encoded before use in URIs
  * @param string $pageTitle  Not used by this extractor
  * @param string $pageSource Raw wiki markup of the page
  *
  * @return ExtractionResult  Empty unless the marker was found
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $extraction = new ExtractionResult($pageID, $this->language, self::extractorID);
     $localName = encodeLocalName($pageID);
     $hasWikipediaMarker = preg_match('/\\{\\{wikipedia\\-c(\\-note)?\\}\\}/', $pageSource);
     // Nothing to link when the marker template is absent
     if (!$hasWikipediaMarker) {
         return $extraction;
     }
     $subject = RDFtriple::page($localName);
     $sameAs = RDFtriple::URI("http://www.w3.org/2002/07/owl#sameAs");
     $dbpediaResource = RDFtriple::URI("http://dbpedia.org/resource/" . $localName);
     $extraction->addTriple($subject, $sameAs, $dbpediaResource);
     return $extraction;
 }
 /**
  * Builds one triple per link that the configured parser finds in $value.
  *
  * Each link is put into canonical wiki case, encoded as a local name and
  * appended to $this->basePath to form the object URI. The predicate is
  * DB_ONTOLOGY_NS followed by $propertyName.
  *
  * @param string $subjectName  Name of the subject page
  * @param string $propertyName Property local name within DB_ONTOLOGY_NS
  * @param string $value        Raw value handed to $this->parser
  *
  * @return array List of RDFtriple objects (empty when no link was parsed)
  */
 public function generate($subjectName, $propertyName, $value)
 {
     $triples = array();
     foreach ($this->parser->parse($value) as $rawLink) {
         $canonical = $this->mediaWikiUtil->toCanonicalWikiCase($rawLink);
         $target = $this->basePath . encodeLocalName($canonical);
         // Subject and predicate nodes are created fresh for every triple
         $subject = RDFtriple::page($subjectName);
         $predicate = RDFtriple::URI(DB_ONTOLOGY_NS . $propertyName);
         $triples[] = new RDFtriple($subject, $predicate, RDFtriple::URI($target));
     }
     return $triples;
 }
Example #4
0
/**
 * Appends a triple plus its additional information (object kind, datatype,
 * language) to the global array $parseResult.
 *
 * Each stored entry has the shape
 * array(subject, predicate, object, object_is, dtype, lang).
 *
 * @param string      $subject   The triple's subject; stored after encodeLocalName()
 * @param string      $predicate The triple's predicate; stored after encodeLocalName()
 *                               and the percent handling described below
 * @param string      $object    The triple's object
 * @param string      $file      Legacy, unused; should be removed in the future
 * @param string      $object_is 'r' if object is a reference, 'l' if object is a
 *                               literal, 'b' if object is a blank node
 * @param string|null $dtype     XSD datatype of a literal
 * @param string|null $lang      Language of a literal
 *
 * @return null Nothing is returned; a reference object that fails
 *              URI::validate() makes the function return early without writing.
 *
 * TODO: Should encodeLocalName be used for the whole URL? Should URI objects be used?
 */
function writeTripel($subject, $predicate, $object, $file = 'main', $object_is = 'r', $dtype = NULL, $lang = NULL)
{
    global $parseResult;
    // Silently drop triples whose reference object is not a valid URI
    if ($object_is == 'r' && !URI::validate(encodeLocalName($object))) {
        return null;
    }
    // If $object_is == 'l', encodeLocalName shouldn't be used, the string will be encoded like e.g. \uBC18\uC57C
    if ($object_is != 'l') {
        $object = encodeLocalName($object);
    }
    $predicate = encodeLocalName($predicate);
    if (USE_PERCENT_ENCODING) {
        $predicate = str_replace("%", "_percent_", $predicate);
    } elseif (preg_match('/%([A-F0-9]{2})/', substr($predicate, -3))) {
        // The predicate ends in a percent escape (%XX): append an underscore.
        // preg_match() replaces ereg(), which was removed in PHP 7.0.
        $predicate .= "_";
    }
    $parseResult[] = array(encodeLocalName($subject), $predicate, $object, $object_is, $dtype, $lang);
}
Example #5
0
 /**
  * Recursively turns the wiki templates found in $value into RDF triples.
  *
  * The return value is a three element array:
  *   [0] the generated triples,
  *   [1] meta triples (rdf:type and wikiPageUsesTemplate statements),
  *   [2] the used template names as a map (template name => 1).
  *
  * Note on naming: the local $result is this whole three element array,
  * not just the triples.
  *
  * For every accepted template each argument is mapped to one or more
  * properties (via the template annotation, or a faked identity mapping),
  * and each value is either handed to a triple generator selected by parse
  * hint or, failing that, parsed recursively for sub templates and then by
  * parseAttributeValueWrapper().
  *
  * @param BreadCrumb $breadCrumb Position in the template hierarchy; its depth
  *                               decides which depth-0-only steps are run
  * @param string     $value      Wiki markup to search for templates
  *
  * @return array array(triples, metaTriples, usedTemplateNames) as described above
  */
 private function myGenerate(BreadCrumb $breadCrumb, $value)
 {
     // result is the array containing: triples, meta triples, used templates
     $result = array(array(), array(), array());
     $rootSubjectUri = RDFTriple::page($breadCrumb->getRoot());
     // this array is only relevant on depth 0
     $relatedClasses = array();
     //$metaTriples = array();
     //$usedTemplateNames = array();
     // 'parent' means the parent of the value - thus subject and predicate
     $parentName = $this->breadCrumbTransformer->transform($breadCrumb);
     $parentResource = RDFTriple::page($parentName);
     // NOTE(review): $parentPropertyName is only referenced by the
     // commented-out parent-child block further down.
     $parentPropertyName = null;
     $tmp = $breadCrumb->peekTop(0);
     if (isset($tmp)) {
         $parentPropertyName = $tmp->getPropertyName();
     }
     // Get all templates on this site, indexed by name
     // (there may be multiple templates with the same name)
     $nameToTemplates = SimpleWikiTemplateMatcher::match($value);
     //print_r($value);
     //print_r($nameToTemplates);
     //echo "NOW COMES THE STORM\n";
     foreach ($nameToTemplates as $templateName => $templates) {
         // Skip templates without a name
         if (strlen($templateName) < 1) {
             continue;
         }
         //echo "GOT TEMPLATE NAME $templateName\n";
         $templateName = $this->mediaWikiUtil->toCanonicalWikiCase($templateName);
         if (!$this->templateNameFilter->doesAccept($templateName)) {
             continue;
         }
         $templateUri = RDFTriple::URI(DB_TEMPLATE_NS . $templateName, false);
         // Remember the template as used (map semantics: name => 1)
         $result[2][$templateName] = 1;
         // Get annotations for the template - if there are any
         $lookupName = "Template:{$templateName}/doc";
         // NOTE(review): $ta is only assigned at depth 0. At greater depth it is
         // never set here, so the isset($ta) check below is false and every
         // argument is treated as unmapped.
         if ($breadCrumb->getDepth() == 0) {
             $ta = $this->templateDb->getTemplateAnnotation($lookupName);
             // Create the triples for "relatesToClass"
             // But only for the page itself (not for sub templates)
             // if no related class exists, default to rdf:type owl:Thing
             if (isset($ta)) {
                 foreach ($ta->getRelatedClasses() as $item) {
                     $relatedClasses[$item] = 1;
                 }
             }
         }
         foreach ($templates as $templateIndex => $template) {
             //echo "GOT TEMPLATE INDEX $templateIndex\n";
             // Iterate over all arguments
             $arguments = $template->getArguments();
             foreach ($arguments as $argumentName => $values) {
                 //echo "GOT ARGUMENT NAME $argumentName\n";
                 // propertyNs defaults to DB_PROPERTY_NS unless there
                 // exists a mapping in the templatedb. In that case it will
                 // be set to DB_ONTOLOGY_NS
                 $propertyNs = DB_PROPERTY_NS;
                 $pa = null;
                 if (isset($ta)) {
                     $pas = $ta->getPropertyAnnotations();
                     if (array_key_exists($argumentName, $pas)) {
                         $pa = $pas[$argumentName];
                         $propertyNs = DB_ONTOLOGY_NS;
                     }
                 }
                 //print_r($ta);
                 //echo "PROPERTY NS : $lookupName - $argumentName = $propertyNs\n";
                 // Fake a property mapping if there was none in the db
                 // This maps argumentName back to itself
                 if (!isset($pa)) {
                     // If there was no mapping we might ignore it
                     // depending on an option (we can prevent this extractor
                     // from generating triples with properties in the
                     // dbp:property namespace).
                     // We allow such triples on subResources though.
                     if ($this->allowUnmappedProperties != true && $breadCrumb->getDepth() == 0) {
                         continue;
                     }
                     // If there was no mapping, also rename numeric
                     // argument names (e.g. 1 becomes property1)
                     // this is just cosmetic for the result
                     if (is_numeric($argumentName)) {
                         $argumentName = "property{$argumentName}";
                     }
                     $pa = new PropertyAnnotation($argumentName);
                     $pa->addMapping(new PropertyMapping($argumentName));
                 }
                 // NOTE(review): $argumentName is overwritten inside this loop
                 // (rename, trim, camel-casing/encoding in the default branch),
                 // so a change made for one mapping carries over into the next
                 // mapping of the same argument - confirm this is intended.
                 foreach ($pa->getMappings() as $pm) {
                     $parseHint = $pm->getParseHint();
                     //echo "Mapping $argumentName : {$pm->getRenamedValue()}\n\n";
                     // if the renamed value is not set, use the original
                     // name
                     // otherwise use the mapped value
                     if (!isEmptyString($pm->getRenamedValue())) {
                         $argumentName = $pm->getRenamedValue();
                     }
                     $argumentName = trim($argumentName);
                     //echo "Mapping $argumentName : {$pm->getRenamedValue()}\n\n";
                     // Skip empty properties
                     // FIXME does that even happen?
                     if (strlen($argumentName) < 1) {
                         continue;
                     }
                     //echo "TN = $templateName, AN = $argumentName\n";
                     // Extend a copy of the breadcrumb by this template/argument
                     // to name the child subject.
                     $childBreadcrumb = $breadCrumb->createClone();
                     $childBreadcrumb->push(new BreadcrumbNode($templateName, $templateIndex, $argumentName));
                     //$templateChildName = $this->breadcrumbToSubject($childBreadcrumb);
                     $templateChildName = $this->breadCrumbTransformer->transform($childBreadcrumb);
                     // If there is no parse hint we might be able to derive it
                     if (!isset($parseHint)) {
                         $parseHint = $this->deriveParseHintFromName($argumentName);
                     }
                     // Attempt to obtain a triple generator
                     $tripleGenerator = $this->getTripleGenerator($parseHint);
                     // If we DONT have a triple generator
                     // we fall through to default handling
                     // Per-mapping buckets, same layout as $result
                     $localResult = array(array(), array(), array());
                     if (isset($tripleGenerator)) {
                         foreach ($values as $valueIndex => $value) {
                             //echo "GOT VALUE $value\n";
                             $value = trim($value);
                             // Skip empty values
                             if ($value == "") {
                                 continue;
                             }
                             //echo "PROCESSING $templateChildName - $argumentName $value\n";
                             $tmp = $tripleGenerator->generate($templateChildName, $argumentName, $value);
                             $localResult[0] = array_merge($localResult[0], $tmp);
                             //echo "LOCALRESULT\n";
                             //print_r($localResult[0]);
                             //print_r($triples);
                             //echo "\nSigh\n";
                             //if(isset($triples))
                             //    $result = array_merge($result, $triples);
                         }
                         // append the generated triples
                         //continue;
                     } else {
                         // No parse hint - default handling
                         // if property date and object an timespan
                         // we extract it with following special case
                         $argumentName = propertyToCamelCase($argumentName);
                         $argumentName = encodeLocalName($argumentName);
                         // Configured ignore list: skip this mapping entirely
                         if (in_array($argumentName, $GLOBALS['W2RCFG']['ignoreProperties'])) {
                             continue;
                         }
                         // turn the argument name into a property name
                         $propertyName = $propertyNs . $argumentName;
                         foreach ($values as $valueIndex => $value) {
                             $value = trim($value);
                             // Skip empty values
                             if ($value == "") {
                                 continue;
                             }
                             // For "date" strip link brackets and normalise the
                             // HTML en dash entity to a plain hyphen
                             if ($argumentName == "date") {
                                 $value = str_replace("[", "", $value);
                                 $value = str_replace("]", "", $value);
                                 $value = str_replace("&ndash;", "-", $value);
                             }
                             // Parse out sub templates
                             // if something was extracted:
                             // .) connect subject with subsubject
                             // .) indicate usage at wikipage
                             $subResources = $this->myGenerate($childBreadcrumb, $value);
                             // Fold all three buckets of the recursive call in
                             for ($i = 0; $i < 3; ++$i) {
                                 $localResult[$i] = array_merge($localResult[$i], $subResources[$i]);
                             }
                             //$result = array_merge($result, $triples);
                             //echo "GOT OBJECT $value\n";
                             $localResult[0] = array_merge($localResult[0], parseAttributeValueWrapper($value, $templateChildName, $propertyName, $this->language));
                             //$result = array_merge($result, $triples);
                         }
                     }
                     // For each triple add the ExtractedFromTemplate-Annotation
                     // Exclude triples with wikiPageUsesTemplate as predicate though
                     foreach ($localResult[0] as $triple) {
                         $triple->addExtractedFromTemplateAnnotation($templateUri);
                     }
                     // Add on delete cascade annotation
                     // (only applied when the breadcrumb depth is greater than 1)
                     if ($breadCrumb->getDepth() > 1) {
                         foreach ($localResult[0] as $triple) {
                             $triple->addOnDeleteCascadeAnnotation($rootSubjectUri);
                         }
                     }
                     // merge the results
                     //for($i = 0; $i < 3; ++$i)
                     //    $result[$i] = array_merge($result[$i], $localResult[$i]);
                     //}
                     for ($i = 0; $i < 3; ++$i) {
                         $result[$i] = array_merge($result[$i], $localResult[$i]);
                     }
                 }
             }
             /*
             How to connect a sub-subject to the root subject?
                             if($breadCrumb->getDepth() == 0)
                continue;
             
                             // Create the parent-child connection
                             $parentChildTriple = new RDFtriple(
                $parentResource,
                RDFtriple::URI(DB_PROPERTY_NS . encodeLocalName($parentPropertyName), false),
                RDFtriple::page($templateChildName));
             
                             //$result[1][] = $parentChildTriple;
             */
         }
     }
     // Type the parent resource: one rdf:type per related class, or
     // owl:Thing as the default on depth 0 when no related class was found.
     if (count($relatedClasses) > 0) {
         foreach ($relatedClasses as $relatedClass => $dummy) {
             $result[1][] = new RDFtriple($parentResource, RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(DB_ONTOLOGY_NS . $relatedClass, false));
         }
     } else {
         if ($breadCrumb->getDepth() == 0) {
             $result[1][] = new RDFtriple($parentResource, RDFtriple::URI(RDF_TYPE, false), RDFtriple::URI(OWL_THING, false));
         }
     }
     // Add the wiki page uses template triples - but only on depth 0
     if ($breadCrumb->getDepth() == 0) {
         foreach ($result[2] as $name => $dummy) {
             $result[1][] = new RDFTriple($parentResource, self::$wikiPageUsesTemplateUri, RDFTriple::URI(DB_TEMPLATE_NS . $name, false));
         }
     }
     // Trace-log the number of triples and meta triples, then each of them
     $n = count($result[0]) + count($result[1]);
     $this->log(TRACE, "Generated a total of {$n} triples at {$breadCrumb}");
     foreach ($result[0] as $item) {
         $this->log(TRACE, $item);
     }
     foreach ($result[1] as $item) {
         $this->log(TRACE, $item);
     }
     return $result;
 }
/**
 * Parses internal links inside an attribute value and writes the resulting
 * triple(s) through writeTripel().
 *
 * - If a link is found: links to currencies are replaced with the respective symbol, external links are removed
 *   (these are usually references), links to dates are removed (if more than one link was found).
 * - If only digits and currencies are at the beginning of the string, anything else is removed and the number
 *   is parsed for its type (int, float, unit).
 * - In any other case, where internal links are mixed with text, the function compares the aggregated word length
 *   of the links with the length of the text items. If the weighted links are at least as long, the string is
 *   parsed as a link list, else the brackets are removed and the string is written as a text literal.
 *
 * @param string $o     Attribute value; passed by reference and rewritten while it is cleaned up
 * @param string $s     Subject handed on to writeTripel()
 * @param string $p     Predicate URI; the part after $GLOBALS['W2RCFG']['propertyBase'] is compared
 *                      against $GLOBALS['linklistpredicates']
 * @param mixed  $dtype Passed by reference; only assigned in the number branch, from parseAttributeValue()
 *
 * @return bool|null false when $o contains no internal link, true when the value was handled,
 *                   and no explicit value (null) when the cleaned string starts with neither
 *                   text nor an internal link
 */
function catchLinkList(&$o, $s, $p, &$dtype)
{
    //	Match for any Link
    $foundLink = preg_match_all("/(\\[{2})([^\\]]+)(\\]{2})/", $o, $matches);
    if (!$foundLink) {
        return false;
    }
    // Initialize object-type with literal
    $object_is = 'l';
    // echo "\n$o";
    // Test whether property is included in known Linklists and parse Links
    $knownLinkLists = $GLOBALS['linklistpredicates'];
    // Remove DBpedia Base URI
    $propertyName = substr($p, strlen($GLOBALS['W2RCFG']['propertyBase']), strlen($p));
    // Compare property-name with known LinkList properties
    foreach ($knownLinkLists as $linkList) {
        if ($linkList == $propertyName) {
            preg_match_all("/(\\[{2})([^\\]]+)(\\]{2})/", $o, $matches);
            foreach ($matches[2] as $l) {
                if (strlen($l) > 1) {
                    // Extract internal links of type [[abc|def]]
                    $pos = stripos($l, "|");
                    if ($pos) {
                        $l = substr($l, 0, $pos);
                    }
                    $object = $GLOBALS['W2RCFG']['wikipediaBase'] . ucwords(encodeLocalName($l));
                    $object_is = 'r';
                    writeTripel($s, $p, $object, 'main', $object_is);
                    unset($object);
                }
            }
            return true;
        }
    }
    // $weight: If text is mixed with links, this is the weight assigned to the links
    // in order to decide whether the composite link/text string is parsed as link-list or text-literal.
    // Any value > 1 gives more weight to links, any value between 0 and 1 more to the text part.
    $weight = 1.25;
    //	If an internal Link was found:
    // Replace Links to currencies with the respective Symbol
    $currencies = array("U.S. (D|d)ollar" => "\$", "United States (D|d)ollar" => "\$", "Dollar" => "\$", "Euro" => "€", "Yen" => "¥", "Pound" => "£");
    // $z = str_replace('$','\$',$o);
    foreach ($currencies as $key => $currency) {
        // Do not match real Links to currencies e.g. United_States: currency = [[United States Dollar]] ($)
        // NOTE(review): for the dollar entries $currency is "$", which is spliced in unescaped and
        // therefore acts as an end anchor in this pattern rather than a literal dollar sign - verify intent.
        if (preg_match('/^\\s*\\[{2}' . $key . '\\s?\\|?[^\\]]*\\]{2}[\\(\\s ]*' . $currency . '[\\)\\s]*$/', $o)) {
            break;
        }
        $o = trim(preg_replace('/(^[^' . $currency . ']*)(\\[{2}' . $key . '\\s?\\|?[^\\]]*\\]{2})/', '\\1' . $currency, $o));
        // Old Version
        // $o = trim(preg_replace("/\[{2}".$key."\s?\|?[^\]]*\]{2}(^$)/",$currency,$o));
    }
    // Remove External Links (these are usually references)
    $o = trim(preg_replace("/\\[http:\\/\\/[^\\]]+\\]/", "", $o));
    //	Remove any Links between parentheses
    //	Remove links in parentheses. Bug: Destroys Links with "()" inside an internal Link. e.g. Boris_Becker: birthplace
    $o = trim(preg_replace("/\\([^\\[\\]]*\\[{2}[^\\)\\]]*\\]{2}[^\\)]*\\)/", "", $o));
    //	If Link is a Date and more than one Link was found, remove Link
    if ($foundLink > 1) {
        // All twelve month names ("October" was missing, so links such as
        // [[October 5]] were never recognised as dates).
        $months = array("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December");
        foreach ($months as $month) {
            // $o = trim(preg_replace("/\[{2}$month [0-9]{1,2}\]{2},?[\s]*,?/","",$o));
            $o = trim(preg_replace("/\\[{2}{$month} [0-9]{1,2}\\]{2},?[\\s]*(,?[\\s]*(\\(?\\[{2}|\\()[0-9]{4}(\\]{2}\\)?|\\)))?/", "", $o));
        }
        // If Link is a year, remove Link
        $o = trim(preg_replace("/\\(?\\[{1,2}[0-9]{4}\\]{1,2}\\)?/", "", $o));
    }
    // Initialize ResultString
    $resultstring = "";
    // String begins with Text and is followed by one or more Links
    if (preg_match("/^([^\\[]+)(\\[{2})*/", $o, $stringStart)) {
        // String ends with a Link -> this means String is like: "abc [[def | jjj ]] ghi [[ xyz ]]"  (Problem "abc [[def]][[xyz]]")
        if (preg_match("/\\]{2}\$/", $o)) {
            // Match Text, followed by a Link
            $found = preg_match_all("/([^\\[\\]]+)(\\[{2})([^\\]]+)(\\]{2})/", $o, $matches);
            $linkPos = 3;
            // Position of Links in $matches
            $textPos = 1;
            // Position of Text in $matches
            // String ends with Text -> this means String is like: "abc [[def | jjj ]] ghi [[ xyz ]] klm"
        } else {
            // Initialize ResultString with "abc "
            $resultstring = $stringStart[1];
            // Match Link, followed by Text
            $found = preg_match_all("/(\\[{2})([^\\]]+)(\\]{2})([^\\[]+)/", $o, $matches);
            $linkPos = 2;
            // Position of Links in $matches
            $textPos = 4;
            // Position of Text in $matches
        }
        // String starts with numbers and/or currency Symbols
        if (preg_match("/(^[\\s]*([0-9\$€£¥]+[\\.,][0-9\$€£¥]+|[0-9\$€£¥]+)[\\s]*(((B|b)illion)?|((M|m)illion)?|((T|t)rillion)?|((Q|q)uadrillion)?))(.*)/", $o, $numberMatch) && strlen(trim($numberMatch[1])) > 2 || preg_match("/^([0-9\$€£¥]+[\\.,][0-9\$€£¥]+|[0-9\$€£¥]+)([\\s]*\$)/", $o)) {
            // CodeBlock for parsing Numbers
            // Remove any remaining Links
            // $o = preg_replace("/\[{2}[^\]]+\]{2}/","",$o);
            // echo "\n$o";
            // Read Links to numbers, e.g., BMW: revenue => € 4.9 [[10000000 (number)| billion]]
            if (preg_match("/(^[\\s]*([0-9\$€£¥]+[\\.,][0-9\$€£¥]+|[0-9\$€£¥]+)[\\s]*)(\\[{2}[^\\]\\|]*\\(number\\)[^\\]\\|]*\\|)([^\\]]+)(\\]{2})(.*\$)/", $o, $numberMatch)) {
                $o = trim($numberMatch[1]) . " " . trim($numberMatch[4]);
            } else {
                if ($startPos = strpos($o, "[[")) {
                    // Remove anything after first Link
                    $o = substr($o, 0, $startPos);
                }
            }
            // Remove year: e.g. p:revenue = "22 billion $ (2004) => "22 billion $";
            if (preg_match("/^[\\s]*[0-9\$€£¥]+[^\\(]+\\([0-9]{4}\\).*/", $o)) {
                $o = trim(preg_replace("/\\([0-9]{4}\\)/", "", $o));
            }
            $o = trim($o);
            // echo "\n parsing for values $o";
            // Let the generic value parser decide object kind, datatype and language
            list($o, $o_is, $dtype, $lang) = parseAttributeValue($o, $s, $p);
            if ($o !== NULL) {
                writeTripel($s, $p, $o, 'main', $o_is, $dtype, $lang);
            }
            return true;
        } else {
            // Calculate aggregate length of text and Links
            $lengthLink = 0;
            $lengthText = strlen($resultstring);
            foreach ($matches[$linkPos] as $match) {
                // For [[abc|def]] only the part from the pipe onwards is counted
                if ($pos = strpos($match, "|")) {
                    $lengthLink += strlen(preg_replace("/\\s/", "", substr($match, $pos, strlen($match) - $pos)));
                } else {
                    $lengthLink += strlen(preg_replace("/\\s/", "", $match));
                }
            }
            foreach ($matches[$textPos] as $match) {
                $lengthText += strlen(preg_replace("/\\s/", "", $match));
            }
            // compare aggregated length of links and literals ($weight is defined at the top of this function)
            if ($weight * $lengthLink >= $lengthText) {
                // CodeBlock for Start with Text and more Links than Text
                // echo "Start with Text: more Links ($lengthLink,$lengthText)";
                foreach ($matches[$linkPos] as $l) {
                    if (strlen($l) > 1) {
                        // Extract internal links of type [[abc|def]]
                        $pos = stripos($l, "|");
                        if ($pos) {
                            $l = substr($l, 0, $pos);
                        }
                        $object = $GLOBALS['W2RCFG']['wikipediaBase'] . ucwords(encodeLocalName($l));
                        $object_is = 'r';
                        writeTripel($s, $p, $object, 'main', $object_is);
                        unset($object);
                    }
                }
                return true;
            } else {
                // CodeBlock for Start with Text and more Text than Links
                // echo "Start with Text: more Text ($lengthLink,$lengthText) ($o)";
                // Replace Links with their Labels
                $o = preg_replace_callback("/(\\[{2}[^\\|^\\]]+)(\\|)([^\\]]+)(\\]{2})/", 'getLabelForLink', $o);
                // Replace simple links with their link-text
                $o = preg_replace("/\\[{2}|\\]{2}/", "", $o);
                writeTripel($s, $p, $o, 'main', $object_is);
                return true;
            }
        }
        // String begins with Links and is followed by Text or Links)
    } else {
        if (preg_match("/^(\\[{2})([^\\]]+)(\\]{2})/", $o, $stringStart)) {
            // String ends with a Link -> this means String is like: "[[def | jjj ]] ghi [[ xyz ]]"
            if (preg_match("/\\]{2}\$/", $o)) {
                // Initialize ResultString with "[[def | jjj ]]"
                $resultstring = $stringStart[2];
                // Match Text, followed by a Link
                $found = preg_match_all("/([^\\[\\]]+)(\\[{2})([^\\]]+)(\\]{2})/", $o, $matches);
                $linkPos = 3;
                // Position of Links in $matches
                $textPos = 1;
                // Position of Text in $matches
                // String ends with Text -> this means String is like: "[[def | jjj ]] ghi [[ xyz ]] klm"
            } else {
                // Match Link, followed by Text
                $found = preg_match_all("/(\\[{2})([^\\]]+)(\\]{2})([^\\[]+)/", $o, $matches);
                $linkPos = 2;
                // Position of Links in $matches
                $textPos = 4;
                // Position of Text in $matches
            }
            // String is composed only of Links -> this means String is like: "[[abc]][[def]]"
            if (!$found) {
                // CodeBlock for returning only Links
                $found = preg_match_all("/(\\[{2})([^\\]]+)(\\]{2})/", $o, $matches);
                foreach ($matches[2] as $l) {
                    if (strlen($l) > 1) {
                        // Extract internal links of type [[abc|def]]
                        $pos = stripos($l, "|");
                        if ($pos) {
                            $l = substr($l, 0, $pos);
                        }
                        $object = $GLOBALS['W2RCFG']['wikipediaBase'] . ucwords(encodeLocalName($l));
                        $object_is = 'r';
                        writeTripel($s, $p, $object, 'main', $object_is);
                        unset($object);
                    }
                }
                return true;
            } else {
                // Calculate aggregate length of text and Links
                // If String starts and ends with Link, add length of first Link
                $lengthLink = strlen($resultstring);
                if ($lengthLink > 0) {
                    // If first Links of type [[abc | def]] only count "def"
                    // NOTE(review): the label length is added on top of the full
                    // link length set above rather than replacing it - confirm.
                    if ($pos = strpos($resultstring, "|")) {
                        $lengthLink += strlen(preg_replace("/\\s/", "", substr($resultstring, $pos, strlen($resultstring) - $pos)));
                    }
                }
                $lengthText = 0;
                // add length of current link (in $matches) to aggregate length (if link is like [[abc|def]], only def counts)
                foreach ($matches[$linkPos] as $match) {
                    if ($pos = strpos($match, "|")) {
                        $lengthLink += strlen(preg_replace("/\\s/", "", substr($match, $pos, strlen($match) - $pos)));
                    } else {
                        $lengthLink += strlen(preg_replace("/\\s/", "", $match));
                    }
                }
                // add length of literals to aggregate text-length
                foreach ($matches[$textPos] as $match) {
                    $lengthText += strlen(preg_replace("/\\s/", "", $match));
                }
                // compare aggregated length of links and literals ($weight is defined at the top of this function)
                if ($weight * $lengthLink >= $lengthText) {
                    // CodeBlock for Start with Link and more Links than Text
                    // echo "Start with Link: more Links ($lengthLink,$lengthText) ($o)";
                    // Put the leading link (captured separately) back in front
                    if (strlen($resultstring) > 1) {
                        array_unshift($matches[$linkPos], $resultstring);
                    }
                    foreach ($matches[$linkPos] as $l) {
                        if (strlen($l) > 1) {
                            // Extract internal links of type [[abc|def]]
                            $pos = stripos($l, "|");
                            if ($pos) {
                                $l = substr($l, 0, $pos);
                            }
                            $object = $GLOBALS['W2RCFG']['wikipediaBase'] . ucwords(encodeLocalName($l));
                            $object_is = 'r';
                            writeTripel($s, $p, $object, 'main', $object_is);
                            unset($object);
                        }
                    }
                    return true;
                } else {
                    // CodeBlock for Start with Link and more Text than Links
                    // echo "Start with Link: more Text ($lengthLink,$lengthText) ($o)";
                    // Replace Links with their Labels
                    $o = preg_replace_callback("/(\\[{2}[^\\|^\\]]+)(\\|)([^\\]]+)(\\]{2})/", 'getLabelForLink', $o);
                    // Replace simple links with their link-text
                    $o = preg_replace("/\\[{2}|\\]{2}/", "", $o);
                    writeTripel($s, $p, $o, 'main', $object_is);
                    return true;
                }
            }
        }
    }
}
Example #7
0
/**
 * Registers a name per output file and, the first time it is seen, writes its
 * explicit typing triple through writeTripel().
 *
 * Names are remembered in function-static arrays, keyed by $filename, under
 * the name plus a kind suffix (':Cat', ':Temp' or ':Pred').
 *
 * @param string $name      Category, template or predicate name
 * @param string $filename  Key under which seen names are tracked
 * @param string $name_is   'c' (category), 't' (template) or 'p' (predicate)
 * @param string $object_is Object kind to remember for the name; the default
 *                          'n' stores nothing. For predicates, 'l' selects the
 *                          datatype property base, anything else the object
 *                          property base.
 *
 * @return mixed Nothing (null) when the name is registered for the first time;
 *               on later calls the object kind stored at registration, or null
 *               when none was stored.
 */
function printexplicitTyping($name, $filename, $name_is, $object_is = 'n')
{
    static $namearray = array();
    static $predicatetypearray = array();
    // Registry key; stays null for an unknown $name_is instead of being undefined
    $save = null;
    if ($name_is == 'c') {
        $save = $name . ':Cat';
    }
    if ($name_is == 't') {
        $save = $name . ':Temp';
    }
    if ($name_is == 'p') {
        $save = $name . ':Pred';
    }
    // Make sure the per-file list exists before it is searched
    if (!isset($namearray[$filename])) {
        $namearray[$filename] = array();
    }
    if (arrayMultiSearch($save, $namearray[$filename])) {
        // Already registered: report the remembered object kind, if any
        return isset($predicatetypearray[$save][$filename]['is']) ? $predicatetypearray[$save][$filename]['is'] : null;
    }
    $namearray[$filename][] = $save;
    if ($object_is != 'n') {
        $predicatetypearray[$save][$filename]['is'] = $object_is;
    }
    // 'type' when the configured template/category property is the rdf:type
    // property itself, otherwise 'main'
    $filedecisionTemplate = $GLOBALS['rdftypeProperty'] != $GLOBALS['W2RCFG']['templateProperty'] ? 'main' : 'type';
    $filedecisionCategory = $GLOBALS['rdftypeProperty'] != $GLOBALS['W2RCFG']['categoryProperty'] ? 'main' : 'type';
    if ($name_is == 'c' && $filedecisionCategory == 'type') {
        writeTripel($name, $GLOBALS['W2RCFG']['categoryProperty'], $GLOBALS['W2RCFG']['classBase'], 'type');
    }
    if ($name_is == 'c' && $filedecisionCategory == 'main') {
        // Register the category property itself as a predicate with reference objects
        printexplicitTyping($GLOBALS['W2RCFG']['categoryProperty'], $filename, 'p', 'r');
    }
    if ($name_is == 't' && $filedecisionTemplate == 'type') {
        writeTripel($GLOBALS['W2RCFG']['wikipediaBase'] . $GLOBALS['templateLabel'] . ':' . encodeLocalName($name), $GLOBALS['rdftypeProperty'], $GLOBALS['W2RCFG']['classBase'], 'type');
    }
    if ($name_is == 't' && $filedecisionTemplate == 'main') {
        // Register the template property itself as a predicate with reference objects
        printexplicitTyping($GLOBALS['W2RCFG']['templateProperty'], $filename, 'p', 'r');
    }
    if ($name_is == 'p') {
        writeTripel($name, $GLOBALS['rdftypeProperty'], $object_is == 'l' ? $GLOBALS['W2RCFG']['datatypePropertyBase'] : $GLOBALS['W2RCFG']['objectPropertyBase'], 'type');
    }
    return;
}