Exemplo n.º 1
0
 /**
  * Decodes a URL-encoded MediaWiki page name: the name is URL-decoded, passed
  * through cleanSpace() and its first character is made uppercase.
  *
  * According to the examples that accompany this method, all of the following
  * names are decoded to 'Émile Zola':
  * '%C3%89mile_Zola', '%C3%A9mile_Zola', ' %C3%A9mile Zola ', ' %C3%A9mile _ Zola ', '  Émile _ Zola  '
  *
  * TODO: maybe we should expect (require) the name to be normalized, e.g. with uppercase
  * first letter and without duplicate spaces or spaces at start or end?
  * Would make this method much simpler.
  *
  * NOTE(review): urldecode() also converts '+' to a space; confirm that this is
  * wanted for page names that contain a literal '+'.
  *
  * @param $name encoded MediaWiki page name, e.g. '%C3%89mile_Zola'.
  * Must not include the namespace (e.g. 'Template:').
  * @return the decoded page name with an uppercase first character
  */
 public static function wikiDecode($name)
 {
     PhpUtil::assertString($name, 'name');
     $decoded = urldecode($name);
     $cleaned = self::cleanSpace($decoded);
     // the first character of a page name is always uppercase
     return StringUtil::mb_ucfirst($cleaned);
 }
Exemplo n.º 2
0
 /**
  * Parses a MediaWiki link target into a WikiTitle.
  *
  * The link is URL-decoded first. If the text before the first ':' (after
  * cleanSpace() and lower-casing) is a key of self::$nsCodes, the mapped code
  * is used as namespace and the text after the ':' as page name. In every
  * other case the whole decoded link becomes a page name in self::NS_MAIN.
  *
  * @param $link MediaWiki link target
  * @return WikiTitle the parsed title
  * @throws WikiParserException if the decoded link contains '#', or if it
  * contains a ':' and the prefix before it is empty after cleanSpace()
  */
 public static function parse($link)
 {
     PhpUtil::assertString($link, 'link');
     self::init();
     $target = urldecode($link);
     if (false !== strpos($target, '#')) {
         throw new WikiParserException('Invalid title: "' . $target . '" (Contains #)');
     }
     // unless a known namespace prefix is found, the whole target is a main-namespace name
     $nsCode = self::NS_MAIN;
     $rawName = $target;
     $pieces = explode(':', $target, 2);
     if (2 === count($pieces)) {
         $prefix = mb_strtolower(WikiUtil::cleanSpace($pieces[0]));
         // TODO: handle interwiki links like [[:de:Foo]]
         if (0 === strlen($prefix)) {
             throw new WikiParserException('cannot handle link [' . $target . ']');
         }
         // TODO: handle special prefixes, e.g. [[q:Foo]] links to WikiQuotes
         if (isset(self::$nsCodes[$prefix])) {
             $nsCode = self::$nsCodes[$prefix];
             $rawName = $pieces[1];
         }
     }
     return new WikiTitle($nsCode, StringUtil::mb_ucfirst(WikiUtil::cleanSpace($rawName)));
 }
Exemplo n.º 3
0
 /**
  * Downloads the abstract page for the root title of $node and adds the
  * resulting quads to $this->destination.
  *
  * The URL is built from LocalConfiguration::abstractPageUrlFormat,
  * $this->language and the encoded root title of $node. For a non-empty
  * response, a quad for self::LONG_PROPERTY is added with the HTML-decoded
  * text, preceded by a quad for self::SHORT_PROPERTY if $this->short()
  * returns a non-empty value. A failed or empty response adds no quads.
  *
  * @param $node node whose root title and source URI are used
  * @param $subjectUri subject of the generated quads
  * @param $pageContext not used by this method
  * @return true in every case, including a failed request
  */
 public function extract($node, $subjectUri, $pageContext)
 {
     $this->curl = curl_init();
     $url = sprintf(LocalConfiguration::abstractPageUrlFormat, $this->language, $node->getRoot()->getTitle()->encoded());
     curl_setopt($this->curl, CURLOPT_URL, $url);
     // make curl_exec() return the response body instead of printing it
     curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
     // CURLOPT_BINARYTRANSFER is deliberately not set: it has had no effect
     // since PHP 5.1.3 and is deprecated as of PHP 8.4.
     // treat HTTP status codes >= 400 as a failed transfer
     curl_setopt($this->curl, CURLOPT_FAILONERROR, true);
     $text = curl_exec($this->curl);
     // TODO: report failed requests (curl_exec() returned false), e.g. with
     // curl_errno()/curl_error() and the requested $url. For now a failure is
     // handled like an empty response.
     if (empty($text)) {
         return true;
     }
     $text = StringUtil::htmlDecode($text);
     $shorttext = $this->short($text);
     if (!empty($shorttext)) {
         $quad = new RdfQuad($subjectUri, $this->ontology->getProperty(self::SHORT_PROPERTY), $shorttext, $node->getSourceUri());
         $this->destination->addQuad($quad);
     }
     $quad = new RdfQuad($subjectUri, $this->ontology->getProperty(self::LONG_PROPERTY), $text, $node->getSourceUri());
     $this->destination->addQuad($quad);
     return true;
 }
Exemplo n.º 4
0
 /**
  * Turns a node into cleaned text: the node is converted with nodeToString(),
  * wiki emphasis markup is removed and HTML is decoded.
  *
  * @param Node $node node to convert
  * @return the cleaned text, or null if the cleaned text is empty
  */
 public function parse(Node $node)
 {
     $text = StringUtil::htmlDecode(
         WikiUtil::removeWikiEmphasis(self::nodeToString($node))
     );
     // an empty result is reported as null rather than as ''
     return strlen($text) > 0 ? $text : null;
 }
Exemplo n.º 5
0
 /**
  * Generates a URI for $node below $baseUri.
  *
  * If $node is set, its text is normalized (wiki emphasis removed, HTML
  * decoded, parentheses and tags dropped), cut to at most 50 bytes, trimmed,
  * URL-encoded with '_' in place of spaces and appended to $baseUri after
  * '__'. If $node is not set, $baseUri itself is used.
  *
  * Every URI built this way is counted in $this->uris; when the same URI is
  * built again, '__' followed by the number of earlier occurrences is appended.
  *
  * @param $baseUri URI that the result starts with
  * @param $node node whose text becomes the URI suffix; may be null
  * @return the generated URI
  */
 public function generate($baseUri, $node)
 {
     if (isset($node)) {
         //Retrieve text
         $text = $this->nodeToText($node);
         //Normalize text
         $text = WikiUtil::removeWikiEmphasis($text);
         $text = StringUtil::htmlDecode($text);
         //remove duplicate spaces
         $text = preg_replace('/ +/', ' ', $text);
         $text = str_replace('(', ' ', $text);
         $text = str_replace(')', ' ', $text);
         $text = strip_tags($text);
         // Cut to at most 50 bytes without splitting a multi-byte character:
         // plain substr() could leave half a character behind, which urlencode()
         // would turn into an invalid percent-encoded sequence.
         // Assumes the text is UTF-8 - TODO confirm.
         $text = mb_strcut($text, 0, 50, 'UTF-8');
         $text = trim($text);
         $text = str_replace(' ', '_', $text);
         $text = urlencode($text);
         //Test if the base URI ends with a prefix of text
         // NOTE(review): inside this loop the part of $baseUri that starts at $i
         // is always shorter than $text, while substr_compare() is asked to
         // compare $textLen characters, so it appears never to return 0 and the
         // branch never runs. Confirm the intent before changing it: enabling
         // the check would alter the generated URIs.
         $baseLen = strlen($baseUri);
         $textLen = strlen($text);
         for ($i = $baseLen - 1; $i > 0 && $baseLen - $i < $textLen; $i--) {
             if (substr_compare($baseUri, $text, $i, $textLen, true) === 0) {
                 $text = substr($text, $baseLen - $i);
                 break;
             }
         }
         //Remove leading underscore
         if (!empty($text) && $text[0] === '_') {
             $text = substr($text, 1);
         }
         //Generate URI
         $uri = $baseUri . '__' . $text;
     } else {
         $uri = $baseUri;
     }
     //Resolve collisions
     if (!isset($this->uris[$uri])) {
         //No collision
         $this->uris[$uri] = 1;
     } else {
         //Collision found: append the number of earlier occurrences
         $index = $this->uris[$uri];
         $this->uris[$uri] = $index + 1;
         $uri .= '__' . $index;
     }
     return $uri;
 }
Exemplo n.º 6
0
 /**
  * @param $baseDir path of an existing directory. It is resolved with realpath(),
  * backslashes are replaced by forward slashes and a trailing '/' is appended
  * if it is missing.
  * @param $skipNames names (not paths) of files and directories to skip, e.g. '.svn'.
  * If not given, all files and directories will be included.
  * @param $paths array of strings, paths of files to use, relative to base dir,
  * using forward slashes. If not given, all files and directories will be included.
  * @throws \InvalidArgumentException if $baseDir is not an existing directory
  */
 public function __construct($baseDir, $skipNames = null, $paths = null)
 {
     PhpUtil::assertString($baseDir, 'base dir');
     // realpath() returns false for a path that does not exist. In that case keep
     // the value passed by the caller, so that the error message names the
     // offending path instead of ending in an empty string.
     $resolved = realpath($baseDir);
     if ($resolved !== false) {
         $baseDir = str_replace('\\', '/', $resolved);
     }
     if ($resolved === false || !is_dir($baseDir)) {
         throw new \InvalidArgumentException('base dir must be an existing directory, but is ' . $baseDir);
     }
     // make sure that $baseDir ends with /
     if (!StringUtil::endsWith($baseDir, '/')) {
         $baseDir .= '/';
     }
     if ($skipNames !== null) {
         PhpUtil::assertArray($skipNames, 'skip names');
     } else {
         // no names to skip
         $skipNames = array();
     }
     // $paths stays null when not given
     if ($paths !== null) {
         PhpUtil::assertArray($paths, 'paths');
     }
     $this->baseDir = $baseDir;
     $this->skipNames = $skipNames;
     $this->paths = $paths;
 }
Exemplo n.º 7
0
 /**
  * Returns the encoded title of $this->parsedTitle without the template
  * mapping suffix.
  *
  * @return the encoded title with LocalConfiguration::templateMappingSuffix
  * cut off, or false if the title is not in the namespace
  * LocalConfiguration::templateMappingNsCode or does not end with that suffix
  */
 private function templateMappingPath()
 {
     $inMappingNamespace = $this->parsedTitle->nsCode() === LocalConfiguration::templateMappingNsCode;
     if (!$inMappingNamespace) {
         return false;
     }
     $encoded = $this->parsedTitle->encoded();
     $mappingSuffix = LocalConfiguration::templateMappingSuffix;
     // keep everything in front of the suffix
     return StringUtil::endsWith($encoded, $mappingSuffix)
         ? substr($encoded, 0, -strlen($mappingSuffix))
         : false;
 }
Exemplo n.º 8
0
 /**
  * Hack that turns a page title with a namespace prefix into a name:
  * a leading 'dbpedia/' is cut off, any other title gets its first '/'
  * replaced by a ':'. So 'dbpedia/Person' becomes 'Person' and 'foaf/name'
  * becomes 'foaf:name'.
  *
  * @param $name wiki-encoded page title
  * @return $name without 'dbpedia/' prefix or with first '/' replaced by a ':'.
  * @throws InvalidArgumentException if $name does not include a '/'
  */
 public static function getName($name)
 {
     if (StringUtil::startsWith($name, 'dbpedia/')) {
         // 8 is the length of 'dbpedia/'
         return substr($name, 8);
     }
     $slashPos = strpos($name, '/');
     if (false === $slashPos) {
         throw new \InvalidArgumentException('missing namespace in page title ' . $name);
     }
     // swap the single '/' character at $slashPos for ':'
     return substr_replace($name, ':', $slashPos, 1);
 }