/**
 * Replaces common plain text characters with formatted entities.
 *
 * As an example,
 *
 *     'cause today's effort makes it worth tomorrow's "holiday" ...
 *
 * Becomes:
 *
 *     ’cause today’s effort makes it worth tomorrow’s “holiday” …
 *
 * Code within certain HTML blocks is skipped.
 *
 * Do not use this function before the 'init' action hook; everything will break.
 *
 * How it works:
 *
 * 1. On the first call (or whenever $reset is true) the translated quote,
 *    prime and dash characters and all search/replace tables are built and
 *    cached in static variables.
 * 2. The text is split into delimiters (HTML elements, HTML comments and
 *    registered shortcodes) and the text between them.
 * 3. Text is texturized only while no element or shortcode listed in the
 *    'no_texturize_tags' / 'no_texturize_shortcodes' filters is open.
 * 4. Bare ampersands (those that do not already start an entity) are escaped
 *    as &#038; both in HTML element delimiters and in texturized text.
 *
 * If the 'run_wptexturize' filter returns false, the text is returned
 * unchanged on that call and on every later call, because the result is kept
 * in a static variable that is checked before the setup section runs.
 *
 * @since 0.71
 *
 * @global array $wp_cockneyreplace Array of formatted entities for certain common phrases
 * @global array $shortcode_tags
 * @staticvar array $static_characters
 * @staticvar array $static_replacements
 * @staticvar array $dynamic_characters
 * @staticvar array $dynamic_replacements
 * @staticvar array $default_no_texturize_tags
 * @staticvar array $default_no_texturize_shortcodes
 * @staticvar bool  $run_texturize
 *
 * @param string $text  The text to be formatted
 * @param bool   $reset Set to true for unit testing. Translated patterns will reset.
 * @return string The string replaced with html entities. The input is returned
 *                unchanged when it is empty, when texturizing is disabled, or
 *                when the text could not be split.
 */
function wptexturize($text, $reset = false) {
    global $wp_cockneyreplace, $shortcode_tags;
    static $static_characters = null,
        $static_replacements = null,
        $dynamic_characters = null,
        $dynamic_replacements = null,
        $default_no_texturize_tags = null,
        $default_no_texturize_shortcodes = null,
        $run_texturize = true,
        $apos = null,
        $prime = null,
        $double_prime = null,
        $opening_quote = null,
        $closing_quote = null,
        $opening_single_quote = null,
        $closing_single_quote = null,
        $open_q_flag = '<!--oq-->',
        $open_sq_flag = '<!--osq-->',
        $apos_flag = '<!--apos-->';

    // If there's nothing to do, just stop.
    if (empty($text) || false === $run_texturize) {
        return $text;
    }

    // Set up static variables. Run once only.
    if ($reset || !isset($static_characters)) {
        /**
         * Filter whether to skip running wptexturize().
         *
         * Passing false to the filter will effectively short-circuit wptexturize().
         * returning the original text passed to the function instead.
         *
         * The filter runs only once, the first time wptexturize() is called.
         *
         * @since 4.0.0
         *
         * @see wptexturize()
         *
         * @param bool $run_texturize Whether to short-circuit wptexturize().
         */
        $run_texturize = apply_filters('run_wptexturize', $run_texturize);
        if (false === $run_texturize) {
            return $text;
        }

        /* translators: opening curly double quote */
        $opening_quote = _x('“', 'opening curly double quote');
        /* translators: closing curly double quote */
        $closing_quote = _x('”', 'closing curly double quote');

        /* translators: apostrophe, for example in 'cause or can't */
        $apos = _x('’', 'apostrophe');

        /* translators: prime, for example in 9' (nine feet) */
        $prime = _x('′', 'prime');
        /* translators: double prime, for example in 9" (nine inches) */
        $double_prime = _x('″', 'double prime');

        /* translators: opening curly single quote */
        $opening_single_quote = _x('‘', 'opening curly single quote');
        /* translators: closing curly single quote */
        $closing_single_quote = _x('’', 'closing curly single quote');

        /* translators: en dash */
        $en_dash = _x('–', 'en dash');
        /* translators: em dash */
        $em_dash = _x('—', 'em dash');

        $default_no_texturize_tags = array('pre', 'code', 'kbd', 'style', 'script', 'tt');
        $default_no_texturize_shortcodes = array('code');

        // if a plugin has provided an autocorrect array, use it
        if (isset($wp_cockneyreplace)) {
            $cockney = array_keys($wp_cockneyreplace);
            $cockneyreplace = array_values($wp_cockneyreplace);
        } else {
            /* translators: This is a comma-separated list of words that defy the syntax of quotations in normal use,
             * for example...  'We do not have enough words yet' ... is a typical quoted phrase.  But when we write
             * lines of code 'til we have enough of 'em, then we need to insert apostrophes instead of quotes.
             */
            $cockney = explode(',', _x("'tain't,'twere,'twas,'tis,'twill,'til,'bout,'nuff,'round,'cause,'em", 'Comma-separated list of words to texturize in your language'));

            $cockneyreplace = explode(',', _x('’tain’t,’twere,’twas,’tis,’twill,’til,’bout,’nuff,’round,’cause,’em', 'Comma-separated list of replacement words in your language'));
        }

        $static_characters = array_merge(array('...', '``', '\'\'', ' (tm)'), $cockney);
        $static_replacements = array_merge(array('…', $opening_quote, $closing_quote, ' ™'), $cockneyreplace);

        // Pattern-based replacements of characters.
        // Sort the remaining patterns into several arrays for performance tuning.
        $dynamic_characters = array('apos' => array(), 'quote' => array(), 'dash' => array());
        $dynamic_replacements = array('apos' => array(), 'quote' => array(), 'dash' => array());
        $dynamic = array();
        $spaces = wp_spaces_regexp();

        // NOTE(review): several patterns below match a literal '<' or '>', while the
        // apostrophe-in-word pattern checks for the '&lt;' / '&gt;' entities. Confirm
        // whether the entity form was intended in those alternations as well.

        // '99' and '99" are ambiguous among other patterns; assume it's an abbreviated year at the end of a quotation.
        if ("'" !== $apos || "'" !== $closing_single_quote) {
            $dynamic['/\'(\\d\\d)\'(?=\\Z|[.,:;!?)}\\-\\]]|>|' . $spaces . ')/'] = $apos_flag . '$1' . $closing_single_quote;
        }
        if ("'" !== $apos || '"' !== $closing_quote) {
            $dynamic['/\'(\\d\\d)"(?=\\Z|[.,:;!?)}\\-\\]]|>|' . $spaces . ')/'] = $apos_flag . '$1' . $closing_quote;
        }

        // '99 '99s '99's (apostrophe)  But never '9 or '99% or '999 or '99.0.
        if ("'" !== $apos) {
            $dynamic['/\'(?=\\d\\d(?:\\Z|(?![%\\d]|[.,]\\d)))/'] = $apos_flag;
        }

        // Quoted Numbers like '0.42'
        if ("'" !== $opening_single_quote && "'" !== $closing_single_quote) {
            $dynamic['/(?<=\\A|' . $spaces . ')\'(\\d[.,\\d]*)\'/'] = $open_sq_flag . '$1' . $closing_single_quote;
        }

        // Single quote at start, or preceded by (, {, <, [, ", -, or spaces.
        if ("'" !== $opening_single_quote) {
            $dynamic['/(?<=\\A|[([{"\\-]|<|' . $spaces . ')\'/'] = $open_sq_flag;
        }

        // Apostrophe in a word.  No spaces, double apostrophes, or other punctuation.
        if ("'" !== $apos) {
            $dynamic['/(?<!' . $spaces . ')\'(?!\\Z|[.,:;!?"\'(){}[\\]\\-]|&[lg]t;|' . $spaces . ')/'] = $apos_flag;
        }

        $dynamic_characters['apos'] = array_keys($dynamic);
        $dynamic_replacements['apos'] = array_values($dynamic);
        $dynamic = array();

        // Quoted Numbers like "42"
        if ('"' !== $opening_quote && '"' !== $closing_quote) {
            $dynamic['/(?<=\\A|' . $spaces . ')"(\\d[.,\\d]*)"/'] = $open_q_flag . '$1' . $closing_quote;
        }

        // Double quote at start, or preceded by (, {, <, [, -, or spaces, and not followed by spaces.
        if ('"' !== $opening_quote) {
            $dynamic['/(?<=\\A|[([{\\-]|<|' . $spaces . ')"(?!' . $spaces . ')/'] = $open_q_flag;
        }

        $dynamic_characters['quote'] = array_keys($dynamic);
        $dynamic_replacements['quote'] = array_values($dynamic);
        $dynamic = array();

        // Dashes and spaces
        $dynamic['/---/'] = $em_dash;
        $dynamic['/(?<=^|' . $spaces . ')--(?=$|' . $spaces . ')/'] = $em_dash;
        $dynamic['/(?<!xn)--/'] = $en_dash;
        $dynamic['/(?<=^|' . $spaces . ')-(?=$|' . $spaces . ')/'] = $en_dash;

        $dynamic_characters['dash'] = array_keys($dynamic);
        $dynamic_replacements['dash'] = array_values($dynamic);
    }

    // Must do this every time in case plugins use these filters in a context sensitive manner
    /**
     * Filter the list of HTML elements not to texturize.
     *
     * @since 2.8.0
     *
     * @param array $default_no_texturize_tags An array of HTML element names.
     */
    $no_texturize_tags = apply_filters('no_texturize_tags', $default_no_texturize_tags);
    /**
     * Filter the list of shortcodes not to texturize.
     *
     * @since 2.8.0
     *
     * @param array $default_no_texturize_shortcodes An array of shortcode names.
     */
    $no_texturize_shortcodes = apply_filters('no_texturize_shortcodes', $default_no_texturize_shortcodes);

    $no_texturize_tags_stack = array();
    $no_texturize_shortcodes_stack = array();

    // Look for shortcodes and HTML elements.

    // Only shortcode names that actually occur in the text are passed on to the shortcode regex.
    preg_match_all('@\\[/?([^<>&/\\[\\]\\x00-\\x20]++)@', $text, $matches);
    $tagnames = array_intersect(array_keys($shortcode_tags), $matches[1]);
    $found_shortcodes = !empty($tagnames);
    $shortcode_regex = $found_shortcodes ? _get_wptexturize_shortcode_regex($tagnames) : '';
    $regex = _get_wptexturize_split_regex($shortcode_regex);

    $textarr = preg_split($regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure; hand back the text untouched
    // instead of passing false to foreach and implode().
    if (false === $textarr) {
        return $text;
    }

    // Matches a bare ampersand, i.e. one that does not already start a numeric or named entity.
    $bare_ampersand_regex = '/&(?!#(?:\\d+|x[a-f0-9]+);|[a-z1-4]{1,8};)/i';

    foreach ($textarr as &$curl) {
        // Only call _wptexturize_pushpop_element if $curl is a delimiter.
        // PREG_SPLIT_NO_EMPTY guarantees $curl has at least one character.
        $first = $curl[0];
        if ('<' === $first) {
            if ('<!--' === substr($curl, 0, 4)) {
                // This is an HTML comment delimiter.
                continue;
            } else {
                // This is an HTML element delimiter.

                // Replace each & with &#038; unless it already looks like an entity.
                // The replacement has to be the escaped form; a plain '&' would be a no-op.
                $curl = preg_replace($bare_ampersand_regex, '&#038;', $curl);

                _wptexturize_pushpop_element($curl, $no_texturize_tags_stack, $no_texturize_tags);
            }
        } elseif ('' === trim($curl)) {
            // This is a newline between delimiters.  Performance improves when we check this.
            continue;
        } elseif ('[' === $first && $found_shortcodes && 1 === preg_match('/^' . $shortcode_regex . '$/', $curl)) {
            // This is a shortcode delimiter.

            if ('[[' !== substr($curl, 0, 2) && ']]' !== substr($curl, -2)) {
                // Looks like a normal shortcode.
                _wptexturize_pushpop_element($curl, $no_texturize_shortcodes_stack, $no_texturize_shortcodes);
            } else {
                // Looks like an escaped shortcode.
                continue;
            }
        } elseif (empty($no_texturize_shortcodes_stack) && empty($no_texturize_tags_stack)) {
            // This is neither a delimiter, nor is this content inside of no_texturize pairs.  Do texturize.

            $curl = str_replace($static_characters, $static_replacements, $curl);

            // Each regex group is only run when its trigger character is present.
            // The *_flag placeholders stand in for the final characters until the
            // prime handling has run, and are then swapped for the real characters.
            if (false !== strpos($curl, "'")) {
                $curl = preg_replace($dynamic_characters['apos'], $dynamic_replacements['apos'], $curl);
                $curl = wptexturize_primes($curl, "'", $prime, $open_sq_flag, $closing_single_quote);
                $curl = str_replace($apos_flag, $apos, $curl);
                $curl = str_replace($open_sq_flag, $opening_single_quote, $curl);
            }
            if (false !== strpos($curl, '"')) {
                $curl = preg_replace($dynamic_characters['quote'], $dynamic_replacements['quote'], $curl);
                $curl = wptexturize_primes($curl, '"', $double_prime, $open_q_flag, $closing_quote);
                $curl = str_replace($open_q_flag, $opening_quote, $curl);
            }
            if (false !== strpos($curl, '-')) {
                $curl = preg_replace($dynamic_characters['dash'], $dynamic_replacements['dash'], $curl);
            }

            // 9x9 (times), but never 0x9999
            if (1 === preg_match('/(?<=\\d)x\\d/', $curl)) {
                // Searching for a digit is 10 times more expensive than for the x, so we avoid doing this one!
                $curl = preg_replace('/\\b(\\d(?(?<=0)[\\d\\.,]+|[\\d\\.,]*))x(\\d[\\d\\.,]*)\\b/', '$1×$2', $curl);
            }

            // Replace each & with &#038; unless it already looks like an entity.
            $curl = preg_replace($bare_ampersand_regex, '&#038;', $curl);
        }
    }
    // Break the reference left behind by the by-reference foreach.
    unset($curl);

    return implode('', $textarr);
}
/**
 * Automated performance testing of the main regex.
 *
 * Benchmarks the regex returned by _get_wptexturize_split_regex() against
 * the supplied input twice: once built without a shortcode pattern, and once
 * built with a shortcode pattern covering every key of the global
 * $shortcode_tags. Each benchmark result must be less than 200.
 *
 * @dataProvider data_whole_posts
 *
 * @global array $shortcode_tags
 *
 * @param mixed $input Data set supplied by the data_whole_posts provider; it is
 *                     passed to benchmark_pcre_backtracking() as the subject.
 *                     Presumably whole post content - confirm against the provider.
 */
function test_pcre_performance($input) {
    global $shortcode_tags;

    // Upper bound (exclusive) for the value reported by benchmark_pcre_backtracking().
    $limit = 200;

    // With Shortcodes Disabled
    $regex = _get_wptexturize_split_regex();
    $result = benchmark_pcre_backtracking($regex, $input, 'split');
    $this->assertLessThan($limit, $result);

    // With Shortcodes Enabled
    $shortcode_regex = _get_wptexturize_shortcode_regex(array_keys($shortcode_tags));
    $regex = _get_wptexturize_split_regex($shortcode_regex);
    $result = benchmark_pcre_backtracking($regex, $input, 'split');
    // Assertions return nothing, so the result is not returned from the test.
    $this->assertLessThan($limit, $result);
}