/**
 * Builds the normalized key used for matching a piece of text in searches.
 *
 * The text is converted to Unicode NFC, every run of control, format,
 * unassigned and surrogate code points is replaced by a single space,
 * leading and trailing separators are trimmed, and the result is lower-cased.
 *
 * @since 0.4
 *
 * @param string|null $text
 *
 * @return string|null Null if $text is null. An empty string if $text is empty,
 *  contains nothing but whitespace/control characters, or cannot be processed
 *  (failed Unicode normalization or invalid UTF-8).
 */
public function getSearchKey($text)
{
    if ($text === null) {
        return null;
    }

    if ($text === '') {
        return '';
    }

    // composed normal form
    $nfcText = $this->stringNormalizer->cleanupToNFC($text);

    if (!is_string($nfcText) || $nfcText === '') {
        wfWarn("Unicode normalization failed for `{$text}`");

        // There is nothing usable to derive a key from. Bail out here instead
        // of feeding a non-string value into preg_replace below.
        return '';
    }

    // WARNING: *any* invalid UTF-8 sequence makes preg_replace fail and return null.
    // Control character classes excluding private use areas.
    $strippedText = preg_replace('/[\\p{Cc}\\p{Cf}\\p{Cn}\\p{Cs}]+/u', ' ', $nfcText);

    if ($strippedText !== null) {
        // \p{Z} includes all whitespace characters and invisible separators.
        $strippedText = preg_replace('/^\\p{Z}+|\\p{Z}+$/u', '', $strippedText);
    }

    if ($strippedText === null || $strippedText === '') {
        // NOTE: An empty string means there was only whitespace in the text.
        // Null means preg_replace failed, e.g. because it encountered an
        // invalid UTF-8 sequence. Both cases yield an empty search key.
        return '';
    }

    //TODO: Use Language::lc to convert to lower case.
    // But that requires us to load ALL the language objects,
    // which loads ALL the messages, which makes us run out
    // of RAM (see bug T43103).
    $normalized = mb_strtolower($strippedText, 'UTF-8');

    if (!is_string($normalized) || $normalized === '') {
        wfWarn("mb_strtolower normalization failed for `{$strippedText}`");
    }

    return $normalized;
}
/**
 * Writes the outcome of an operation on the given entity to the API result.
 *
 * Adds the basic entity information and the revision id taken from $status
 * under the 'entity' path. If the request carried both a 'site' and a 'title'
 * parameter and the title changes when trimmed and converted to NFC, the
 * original and normalized titles are recorded under 'normalized'.
 * Finally the result is marked as successful.
 *
 * @param Entity $entity
 * @param Status $status
 * @param int|null $oldRevId Passed through to the result builder together with $status.
 */
protected function addToOutput(Entity $entity, Status $status, $oldRevId = null)
{
    $this->getResultBuilder()->addBasicEntityInformation($entity->getId(), 'entity');
    $this->getResultBuilder()->addRevisionIdFromStatusToResult($status, 'entity', $oldRevId);

    $requestParams = $this->extractRequestParams();

    // Title normalization is only reported when the entity was addressed by site and title.
    if (isset($requestParams['site'], $requestParams['title'])) {
        $rawTitle = $requestParams['title'];
        $normalizedTitle = $this->stringNormalizer->trimToNFC($rawTitle);

        if ($normalizedTitle !== $rawTitle) {
            $this->getResultBuilder()->addNormalizedTitle($rawTitle, $normalizedTitle, 'normalized');
        }
    }

    $this->getResultBuilder()->markSuccess(1);
}
/**
 * Tries to find item id for given siteId and title combination
 *
 * The title is trimmed and converted to NFC before the first lookup. If that
 * lookup finds nothing and $normalize is true, the title is normalized via
 * normalizeTitle() for the given site and looked up a second time.
 *
 * @param string $siteId
 * @param string $title
 * @param bool $normalize Whether to retry with a title normalized for the site.
 *
 * @return ItemId|null Null if no item is linked to the title, or if the retry
 *  is requested for a site the site store does not know.
 */
private function getItemId($siteId, $title, $normalize)
{
    // FIXME: This code is duplicated in SpecialItemByTitle::execute!
    $title = $this->stringNormalizer->trimToNFC($title);
    $id = $this->siteLinkLookup->getItemIdForLink($siteId, $title);

    if ($id !== null || $normalize !== true) {
        return $id;
    }

    // Try harder by requesting normalization on the external site.
    $siteObj = $this->siteStore->getSite($siteId);

    if ($siteObj === null) {
        // Unknown site: there is nothing to normalize against, and calling
        // getGlobalId() on null would be a fatal error.
        return null;
    }

    //XXX: this passes the normalized title back into $title by reference...
    $this->normalizeTitle($title, $siteObj);

    return $this->siteLinkLookup->getItemIdForLink($siteObj->getGlobalId(), $title);
}
/**
 * Normalizes a string value by delegating to Wikibase\StringNormalizer::trimToNFC(),
 * which trims leading and trailing whitespace and applies Unicode normalization.
 *
 * @see StringNormalizer::normalize()
 * @see Wikibase\StringNormalizer::trimToNFC()
 *
 * @param string $value the value to normalize
 *
 * @throws InvalidArgumentException if $value is not a string (this method does
 *  no type check of its own; any such exception would come from trimToNFC())
 * @return string the normalized value
 */
public function normalize($value)
{
    $normalizedValue = $this->normalizer->trimToNFC($value);

    return $normalizedValue;
}