/**
 * Decodes an encoded MediaWiki page name.
 *
 * The name is URL-decoded, normalized by self::cleanSpace() and finally gets
 * an uppercase first character. All of the following names will be decoded
 * to 'Émile Zola':
 * '%C3%89mile_Zola', '%C3%A9mile_Zola', ' %C3%A9mile Zola ',
 * ' %C3%A9mile _ Zola ', ' Émile _ Zola '
 *
 * TODO: maybe we should expect (require) the name to be normalized, e.g. with
 * uppercase first letter and without duplicate spaces or spaces at start or
 * end? Would make this method much simpler.
 *
 * @param $name encoded MediaWiki page name, e.g. '%C3%89mile_Zola'.
 * Must not include the namespace (e.g. 'Template:').
 * @return the decoded page name
 */
public static function wikiDecode($name)
{
    PhpUtil::assertString($name, 'name');

    $decoded = urldecode($name);
    $cleaned = self::cleanSpace($decoded);

    // make first character uppercase
    return StringUtil::mb_ucfirst($cleaned);
}
/**
 * Builds a WikiTitle from a MediaWiki link target.
 *
 * The link is URL-decoded first. If the text before the first ':' is a known
 * namespace prefix (looked up case-insensitively in self::$nsCodes), the
 * title is created in that namespace; otherwise the complete link is used as
 * a name in the main namespace.
 *
 * @param $link MediaWiki link target
 * @return WikiTitle the parsed title
 * @throws WikiParserException if the link contains '#' or starts with an
 * empty prefix (e.g. ':Foo')
 */
public static function parse($link)
{
    PhpUtil::assertString($link, 'link');
    self::init();

    $link = urldecode($link);
    if (strpos($link, '#') !== false) {
        throw new WikiParserException('Invalid title: "' . $link . '" (Contains #)');
    }

    $pieces = explode(':', $link, 2);
    if (count($pieces) === 2) {
        list($rawPrefix, $rawName) = $pieces;
        $prefix = mb_strtolower(WikiUtil::cleanSpace($rawPrefix));

        // TODO: handle interwiki links like [[:de:Foo]]
        if (strlen($prefix) === 0) {
            throw new WikiParserException('cannot handle link [' . $link . ']');
        }

        // TODO: handle special prefixes, e.g. [[q:Foo]] links to WikiQuotes
        if (isset(self::$nsCodes[$prefix])) {
            $nsCode = self::$nsCodes[$prefix];
            $title = StringUtil::mb_ucfirst(WikiUtil::cleanSpace($rawName));
            return new WikiTitle($nsCode, $title);
        }
    }

    // No colon, or an unknown prefix: the whole link is a main namespace name.
    $title = StringUtil::mb_ucfirst(WikiUtil::cleanSpace($link));
    return new WikiTitle(self::NS_MAIN, $title);
}
/**
 * Downloads the abstract page of the node's root title and adds the
 * resulting abstract quads to $this->destination.
 *
 * The URL is built from LocalConfiguration::abstractPageUrlFormat, the
 * extractor language and the encoded root title. If the response is
 * non-empty, it is HTML-decoded and stored under LONG_PROPERTY; when
 * $this->short() yields a non-empty text, that text is stored under
 * SHORT_PROPERTY first.
 *
 * @param $node node whose root title and source URI are used
 * @param $subjectUri subject of the generated quads
 * @param $pageContext not used by this method
 * @return true, always - also when the download fails
 */
public function extract($node, $subjectUri, $pageContext)
{
    $handle = curl_init();
    $this->curl = $handle;

    $url = sprintf(
        LocalConfiguration::abstractPageUrlFormat,
        $this->language,
        $node->getRoot()->getTitle()->encoded()
    );

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_BINARYTRANSFER, true);
    curl_setopt($handle, CURLOPT_FAILONERROR, true);

    $body = curl_exec($handle);

    // A failed request (false) and an empty response are both skipped silently.
    // TODO: report failures (curl_errno() / curl_error()) and hint that
    // AbstractExtractor.page_url_format must be defined correctly in the
    // option file, e.g. dbpedia.ini.
    if (empty($body)) {
        return true;
    }

    $abstract = StringUtil::htmlDecode($body);

    $shortAbstract = $this->short($abstract);
    if (!empty($shortAbstract)) {
        $shortProperty = $this->ontology->getProperty(self::SHORT_PROPERTY);
        $this->destination->addQuad(
            new RdfQuad($subjectUri, $shortProperty, $shortAbstract, $node->getSourceUri())
        );
    }

    $longProperty = $this->ontology->getProperty(self::LONG_PROPERTY);
    $this->destination->addQuad(
        new RdfQuad($subjectUri, $longProperty, $abstract, $node->getSourceUri())
    );

    return true;
}
/**
 * Converts a node to cleaned text.
 *
 * The node is serialized with self::nodeToString(), then passed through
 * WikiUtil::removeWikiEmphasis() and StringUtil::htmlDecode().
 *
 * @param $node node to convert
 * @return the cleaned text, or null if it is empty
 */
public function parse(Node $node)
{
    //Clean text
    $cleaned = StringUtil::htmlDecode(
        WikiUtil::removeWikiEmphasis(self::nodeToString($node))
    );

    return strlen($cleaned) > 0 ? $cleaned : null;
}
/**
 * Generates a URI below $baseUri that has not been returned before.
 *
 * If a node is given, its text is normalized (wiki emphasis removed via
 * WikiUtil::removeWikiEmphasis(), HTML-decoded, runs of spaces collapsed,
 * parentheses replaced by spaces, tags stripped, cut to 50 bytes, trimmed,
 * spaces turned into underscores, URL-encoded) and appended to the base URI
 * after '__'. If the base URI ends with a prefix of that text
 * (case-insensitive), the overlapping prefix is removed from the text first.
 * Without a node the base URI itself is used.
 *
 * Collisions are tracked in $this->uris: the first request for a URI returns
 * it unchanged, each later request returns it with '__' and a counter
 * (1, 2, ...) appended.
 *
 * @param $baseUri URI the generated URI is based on
 * @param $node node whose text is used as URI suffix, may be null
 * @return the generated URI
 */
public function generate($baseUri, $node)
{
    if (isset($node)) {
        //Retrieve text
        $text = $this->nodeToText($node);

        //Normalize text
        $text = WikiUtil::removeWikiEmphasis($text);
        $text = StringUtil::htmlDecode($text);
        $text = preg_replace('/ +/', ' ', $text); //remove duplicate spaces
        $text = str_replace('(', ' ', $text);
        $text = str_replace(')', ' ', $text);
        $text = strip_tags($text);
        // NOTE(review): substr() counts bytes, so a multi-byte character at
        // the 50 byte boundary is cut in half - confirm whether that matters.
        $text = substr($text, 0, 50);
        $text = trim($text);
        $text = str_replace(' ', '_', $text);
        $text = urlencode($text);

        //Test if the base URI ends with a prefix of text
        $baseLen = strlen($baseUri);
        $textLen = strlen($text);
        for ($i = $baseLen - 1; $i > 0 && $baseLen - $i < $textLen; $i--) {
            // Compare only as many bytes as the suffix of the base URI has.
            // The suffix is always shorter than $text here, so comparing
            // $textLen bytes could never yield 0 and the overlap was never
            // detected.
            $overlap = $baseLen - $i;
            if (substr_compare($baseUri, $text, $i, $overlap, true) === 0) {
                $text = substr($text, $overlap);
                break;
            }
        }

        //Remove leading underscore
        if (!empty($text) && $text[0] === '_') {
            $text = substr($text, 1);
        }

        //Generate URI
        $uri = $baseUri . '__' . $text;
    } else {
        $uri = $baseUri;
    }

    //Resolve collisions
    if (!isset($this->uris[$uri])) {
        //No collision
        $this->uris[$uri] = 1;
    } else {
        //Collision found
        $index = $this->uris[$uri];
        $this->uris[$uri] = $index + 1;
        $uri .= '__' . $index;
    }

    return $uri;
}
/**
 * Stores the resolved base directory together with the optional skip names
 * and paths.
 *
 * @param $baseDir path of an existing directory. It is resolved with
 * realpath(), backslashes are replaced by forward slashes and a trailing '/'
 * is appended if missing.
 * @param $skipNames names (not paths) of files and directories to skip, e.g. '.svn'.
 * If not given, all files and directories will be included.
 * @param $paths array of strings, paths of files to use, relative to base dir,
 * using forward slashes. If not given, all files and directories will be included.
 * @throws \InvalidArgumentException if $baseDir cannot be resolved or is not
 * a directory
 */
public function __construct($baseDir, $skipNames = null, $paths = null)
{
    PhpUtil::assertString($baseDir, 'base dir');

    // realpath() returns false if the path cannot be resolved. Check that
    // before normalizing, so that the error message names the path that was
    // actually given instead of an empty string.
    $resolved = realpath($baseDir);
    if ($resolved === false) {
        throw new \InvalidArgumentException('base dir must be an existing directory, but is ' . $baseDir);
    }

    $baseDir = str_replace('\\', '/', $resolved);
    if (!is_dir($baseDir)) {
        throw new \InvalidArgumentException('base dir must be an existing directory, but is ' . $baseDir);
    }

    // make sure that $baseDir ends with /
    if (!StringUtil::endsWith($baseDir, '/')) {
        $baseDir .= '/';
    }

    if ($skipNames !== null) {
        PhpUtil::assertArray($skipNames, 'skip names');
    } else {
        $skipNames = array();
    }

    if ($paths !== null) {
        PhpUtil::assertArray($paths, 'paths');
    }

    $this->baseDir = $baseDir;
    $this->skipNames = $skipNames;
    $this->paths = $paths;
}
/**
 * Returns the encoded title of the parsed page without the configured
 * template mapping suffix.
 *
 * @return the encoded title with LocalConfiguration::templateMappingSuffix
 * cut off, or false if the title is not in the namespace
 * LocalConfiguration::templateMappingNsCode or does not end with the suffix
 */
private function templateMappingPath()
{
    if ($this->parsedTitle->nsCode() !== LocalConfiguration::templateMappingNsCode) {
        return false;
    }

    $path = $this->parsedTitle->encoded();
    $suffix = LocalConfiguration::templateMappingSuffix;
    if (!StringUtil::endsWith($path, $suffix)) {
        return false;
    }

    // With an empty suffix, substr($path, 0, -0) would return an empty
    // string instead of the unchanged path.
    $suffixLen = strlen($suffix);
    if ($suffixLen === 0) {
        return $path;
    }

    return substr($path, 0, -$suffixLen);
}
/**
 * Hack that turns a page title into a prefixed name: a leading 'dbpedia/'
 * is cut off, otherwise the first '/' is replaced by a ':'. So
 * 'dbpedia/Person' becomes 'Person' and 'foaf/name' becomes 'foaf:name'.
 *
 * @param $name wiki-encoded page title
 * @return $name without 'dbpedia/' prefix or with first '/' replaced by a ':'.
 * @throws InvalidArgumentException if $name does not include a '/'
 */
public static function getName($name)
{
    $defaultPrefix = 'dbpedia/';
    if (StringUtil::startsWith($name, $defaultPrefix)) {
        return substr($name, strlen($defaultPrefix));
    }

    $slashPos = strpos($name, '/');
    if ($slashPos === false) {
        throw new \InvalidArgumentException('missing namespace in page title ' . $name);
    }

    // Swap exactly the one byte at the position of the first slash.
    return substr_replace($name, ':', $slashPos, 1);
}