/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site. If the given title is a redirect,
 * the redirect will be resolved and the redirect target is returned.
 *
 * @note This actually makes an API request to the remote site, so beware
 *   that this function is slow and depends on an external service.
 *
 * @see Site::normalizePageName
 *
 * @since 1.27
 *
 * @param string $pageName Title to normalize.
 * @param string $apiUrl URL the API query parameters are appended to.
 *
 * @return string|false The title reported by the remote site, or false when
 *   the request failed, the response could not be decoded into an array, or
 *   the page record is marked missing/invalid or carries no title.
 * @throws \MWException If $pageName is not a string.
 */
public function normalizePageName($pageName, $apiUrl) {
    // Only $pageName is validated; $apiUrl is handed to wfAppendQuery() as given.
    if (!is_string($pageName)) {
        throw new \MWException('$pageName must be a string');
    }

    // Make sure the string is normalized into NFC (due to T42017)
    // but do nothing to the whitespaces, that should work appropriately.
    // @see https://phabricator.wikimedia.org/T42017
    $pageName = Validator::cleanUp($pageName);

    // Build the args for the specific call
    $args = [
        'action' => 'query',
        'prop' => 'info',
        'redirects' => true,
        'converttitles' => true,
        'format' => 'json',
        'titles' => $pageName,
    ];
    $url = wfAppendQuery($apiUrl, $args);

    // Go on call the external site
    // @todo we need a good way to specify a timeout here.
    $ret = $this->http->get($url, [], __METHOD__);

    // NOTE(review): the concrete type of $this->http is not visible here.
    // MediaWiki's legacy Http::get() signals failure with false, while
    // HttpRequestFactory::get() signals it with null; accept both so a failed
    // request is never misreported as "bad json" further down.
    if ($ret === false || $ret === null) {
        wfDebugLog("MediaWikiSite", "call to external site failed: {$url}");
        return false;
    }

    $data = FormatJson::decode($ret, true);

    if (!is_array($data)) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> returned bad json: " . $ret);
        return false;
    }

    $page = static::extractPageRecord($data, $pageName);

    if (isset($page['missing'])) {
        wfDebugLog(
            "MediaWikiSite",
            "call to <{$url}> returned a marker for a missing page title! " . $ret
        );
        return false;
    }

    if (isset($page['invalid'])) {
        wfDebugLog(
            "MediaWikiSite",
            "call to <{$url}> returned a marker for an invalid page title! " . $ret
        );
        return false;
    }

    if (!isset($page['title'])) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> did not return a page title! " . $ret);
        return false;
    }

    return $page['title'];
}
* * @file * @ingroup UtfNormal */ use UtfNormal\Validator; if (PHP_SAPI != 'cli') { die("Run me from the command line please.\n"); } require_once dirname(__DIR__) . '/vendor/autoload.php'; define('BENCH_CYCLES', 1); define('BIGSIZE', 1024 * 1024 * 10); // 10m ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024); $testfiles = array('testdata/washington.txt' => 'English text', 'testdata/berlin.txt' => 'German text', 'testdata/bulgakov.txt' => 'Russian text', 'testdata/tokyo.txt' => 'Japanese text', 'testdata/young.txt' => 'Korean text'); $normalizer = new Validator(); Validator::loadData(); foreach ($testfiles as $file => $desc) { benchmarkTest($normalizer, $file, $desc); } # ------- function benchmarkTest(&$u, $filename, $desc) { print "Testing {$filename} ({$desc})...\n"; $data = file_get_contents($filename); $all = $data; while (strlen($all) < BIGSIZE) { $all .= $all; } $data = $all; echo "Data is " . strlen($data) . " bytes.\n"; $forms = array('quickIsNFCVerify', 'cleanUp');
/**
 * Quick NFC check; forwards to Validator::quickIsNFCVerify().
 *
 * Yields true only when the string is _definitely_ in NFC, and false when
 * it is not or when that cannot be decided.
 *
 * @param string &$string A UTF-8 string, taken by reference; altered on
 *   output to be valid UTF-8 safe for XML.
 * @return bool
 */
static function quickIsNFCVerify(&$string) {
    $definitelyNfc = Validator::quickIsNFCVerify($string);

    return $definitelyNfc;
}
/**
 * Produce the NFC form of a string via UtfNormal's cleanup routine.
 *
 * The input is first passed through $this->trimBadChars(); that result is
 * then handed to Validator::cleanUp(), whose return value is returned.
 *
 * @param string $inputString The actual string to process.
 *
 * @return string
 */
public function cleanupToNFC($inputString) {
    return Validator::cleanUp($this->trimBadChars($inputString));
}