/**
 * Convert special chars (like german umlauts) to ASCII characters.
 *
 * The returned string only contains the characters A-Z, a-z, 0-9 and "_".
 * Every run of other characters is collapsed into a single "-", and "-" is
 * stripped from both ends of the result.
 *
 * @todo dh> IMHO this function should not be included in a file that gets used often/always.
 * @param string Input string; it is treated as being encoded in $evo_charset
 * @return string
 */
function replace_special_chars($str)
{
    global $evo_charset;

    if (can_convert_charsets('UTF-8', $evo_charset) && can_convert_charsets('UTF-8', 'ISO-8859-1')) {
        // Work in UTF-8 from here on:
        $str = convert_charset($str, 'UTF-8', $evo_charset);

        // TODO: add more...?!
        // NOTE(review): the literals below are expected to be stored as ISO-8859-1 bytes
        // in this file (they get converted to UTF-8 right after) - confirm the file encoding.
        $search = array('Ä', 'ä', 'Ö', 'ö', 'Ü', 'ü', 'ß', 'à', 'ç', 'è', 'é', 'ì', 'ò', 'ô', 'ù'); // iso-8859-1
        $replace = array('Ae', 'ae', 'Oe', 'oe', 'Ue', 'ue', 'ss', 'a', 'c', 'e', 'e', 'i', 'o', 'o', 'u');

        foreach ($search as $k => $v) { // convert $search to UTF-8
            $search[$k] = convert_charset($v, 'UTF-8', 'ISO-8859-1');
        }

        $str = str_replace($search, $replace, $str);

        // Replace HTML entities
        $str = htmlentities($str, ENT_NOQUOTES, 'UTF-8');
    } else {
        // Replace HTML entities only
        $str = htmlentities($str, ENT_NOQUOTES, $evo_charset);
    }

    // Handle special entities (e.g., use "-" instead of "a" for "&").
    // Without this step the regexp below would turn the entity for "&" into the
    // letter "a" ("Tom & Jerry" => "Tom-a-Jerry").
    // The needles are built by concatenation so that they stay intact even if this
    // source file is ever run through an HTML entity decoder.
    $special_entities = array('&' . 'amp;', '&' . 'laquo;', '&' . 'raquo;');
    $str = str_replace($special_entities, '-', $str);

    // Keep only one char in entities!
    $str = preg_replace('/&(.).+?;/', '$1', $str);

    // Replace non acceptable chars
    $str = preg_replace('/[^A-Za-z0-9_]+/', '-', $str);

    // Remove '-' at start and end:
    return trim($str, '-');
}
/**
 * Test {@link bpost_count_words()}.
 *
 * Runs a first group of samples with $evo_charset set to ISO-8859-1 (each sample
 * is passed through convert_charset() with 'ISO-8859-1' and 'UTF-8' first) and a
 * second group with $evo_charset set to UTF-8 (samples passed as they are).
 * The original value of $evo_charset is restored at the end. The whole test is
 * skipped when can_convert_charsets('ISO-8859-1', 'UTF-8') reports false.
 */
function test_bpost_count_words()
{
    global $evo_charset;

    if (!can_convert_charsets('ISO-8859-1', 'UTF-8')) {
        echo 'Skipping tests (cannot convert charsets)...<br />' . "\n";
        return;
    }

    $saved_charset = $evo_charset;

    // Group 1: samples are converted before being counted.
    $evo_charset = 'ISO-8859-1';
    $converted_cases = array(
        array('eine gleichung wie 1 + 2 = 9 /', 3),
        array('mixed with the 3 ümläuts: äää ööö üüü ÄÄÄ ÖÖÖ ÜÜÜ', 10),
    );
    foreach ($converted_cases as $case) {
        $input = convert_charset($case[0], 'ISO-8859-1', 'UTF-8');
        $this->assertEqual(bpost_count_words($input), $case[1]);
    }

    // Group 2: samples are counted as written.
    $evo_charset = 'UTF-8';
    $plain_cases = array(
        array('möre (again 3) ümläüts... öö üü ää ÄÄ ÖÖ ÜÜ', 9),
        array('russian: Расширенные возможности - это удобный', 5),
        array('A versão foi apelidade de Tilqi, porque era aniversário dele. numbers: 42', 11),
        array('HTML tags -> <a href="http://b2evolution.net" target="_blank">visit b2evo!</a>. Some other chars: "\' \' " <<< < >>> > ``` -- versão удобный überladen', 10),
    );
    foreach ($plain_cases as $case) {
        $this->assertEqual(bpost_count_words($case[0]), $case[1]);
    }

    $evo_charset = $saved_charset;
}
/**
 * Convert special chars (like german umlauts) to ASCII characters.
 *
 * Processing steps:
 *  1. HTML entities in the input are decoded.
 *  2. If the effective locale has a non-empty 'transliteration_map' in $locales, it is applied with strtr().
 *  3. evo_iconv_transliterate() is tried; if it returns false, a small built-in list of
 *     characters is transliterated instead (only if the needed charset conversions are available).
 *  4. Remaining special chars are reduced via their HTML entity (first letter of the entity name),
 *     every run of chars outside A-Z, a-z, 0-9 and "_" becomes a single "-", and "-" is
 *     stripped from both ends.
 *
 * @param string Input string to operate on
 * @param NULL|string The post locale or NULL if there is no specific locale.
 *                    Gets passed to evo_iconv_transliterate().
 * @return string The input string with replaced chars.
 */
function replace_special_chars($str, $post_locale = NULL)
{
    global $evo_charset, $default_locale, $current_locale, $locales;

    // Decode entities to be able to transliterate the associated chars:
    // Tblue> TODO: Check if this could have side effects.
    $str = html_entity_decode($str, ENT_NOQUOTES, $evo_charset);

    $our_locale = $post_locale;
    if ($our_locale === NULL) { // post locale is not set, try to guess current locale
        if (!empty($default_locale)) {
            $our_locale = $default_locale;
        }
        if (!empty($current_locale)) { // Override with current locale if available
            $our_locale = $current_locale;
        }
    }

    if ($our_locale !== NULL
        && isset($locales[$our_locale])
        && !empty($locales[$our_locale]['transliteration_map'])) { // Use locale 'transliteration_map' if present
        $transliteration_map = $locales[$our_locale]['transliteration_map'];

        if (!array_key_exists('', $transliteration_map)) { // Make sure there's no empty string key, otherwise strtr() returns false
            $tmp_str = strtr($str, $transliteration_map);
            if ($tmp_str !== false) { // Use newly transliterated string
                $str = $tmp_str;
            }
        }
    }

    $newstr = evo_iconv_transliterate($str, $post_locale);
    if ($newstr !== false) { // iconv allows us to get nice URL titles by transliterating non-ASCII chars.
        // Tblue> htmlentities() does not know anything about ASCII?! ISO-8859-1 will work too, though.
        $newstr_charset = 'ISO-8859-1';
    } elseif (can_convert_charsets('UTF-8', $evo_charset) && can_convert_charsets('UTF-8', 'ISO-8859-1')) {
        // Fallback to the limited old method: Transliterate only a few known chars.
        $newstr = convert_charset($str, 'UTF-8', $evo_charset);
        $newstr_charset = 'UTF-8';

        // NOTE(review): the literals below are expected to be stored as ISO-8859-1 bytes
        // in this file (they get converted to UTF-8 right after) - confirm the file encoding.
        $search = array('Ä', 'ä', 'Ö', 'ö', 'Ü', 'ü', 'ß', 'à', 'ç', 'è', 'é', 'ì', 'ò', 'ô', 'ù'); // iso-8859-1
        $replace = array('Ae', 'ae', 'Oe', 'oe', 'Ue', 'ue', 'ss', 'a', 'c', 'e', 'e', 'i', 'o', 'o', 'u');

        foreach ($search as $k => $v) { // convert $search to UTF-8
            $search[$k] = convert_charset($v, 'UTF-8', 'ISO-8859-1');
        }

        $newstr = str_replace($search, $replace, $newstr);
    } else { // Replace HTML entities only.
        $newstr = $str;
        $newstr_charset = $evo_charset;
    }

    // Replace HTML entities
    $newstr = htmlentities($newstr, ENT_NOQUOTES, $newstr_charset);

    // Handle special entities (e.g., use "-" instead of "a" for "&").
    // This must match the *entity names* produced by htmlentities() above: replacing a
    // bare "&" here would break every entity before the next step can reduce it
    // (e.g. the entity for e-acute would end up as "-eacute-" instead of "e").
    // The needles are built by concatenation so that they stay intact even if this
    // source file is ever run through an HTML entity decoder.
    $special_entities = array('&' . 'amp;', '&' . 'laquo;', '&' . 'raquo;');
    $newstr = str_replace($special_entities, '-', $newstr);

    // Keep only one char in entities!
    $newstr = preg_replace('/&(.).+?;/', '$1', $newstr);

    // Replace non acceptable chars
    $newstr = preg_replace('/[^A-Za-z0-9_]+/', '-', $newstr);

    // Remove '-' at start and end:
    return trim($newstr, '-');
}
/**
 * Check the content of a given URL (referer), if the requested URI (with different hostname variations)
 * is present.
 *
 * The referring page is requested with a plain HTTP/1.0 GET through fsockopen(); reading of the body
 * stops once more than 512000 bytes have been read. The body is then searched for an
 * href="http(s)://[HOST][URI]" where [HOST] is one of: $_SERVER['HTTP_HOST'], $_SERVER['SERVER_NAME'],
 * those names without a leading "www" label, and their idna_decode()d forms.
 * If that fails and the referer starts with $ReqHost, the bare $uri is searched for instead.
 *
 * NOTE(review): the page is always fetched without TLS and the port defaults to 80, also for
 * "https" referers. The referer is external input, so this method makes the server connect to
 * arbitrary hosts - confirm callers are fine with both.
 *
 * @todo Use DB cache to avoid checking the same page again and again! (Plugin DB table)
 *
 * @param string URL of the referring page
 * @param string URI to append to matching pattern for hostnames
 * @return boolean true if a link to us was found; false if not, or if the page could not be fetched
 */
function is_referer_linking_us($referer, $uri)
{
    global $ReqHost;

    if (empty($referer)) {
        return false;
    }

    // Load page content (max. 500kb), using fsockopen:
    $url_parsed = @parse_url($referer);
    if (!$url_parsed) {
        return false;
    }
    if (empty($url_parsed['scheme'])) {
        $url_parsed = parse_url('http://' . $referer);
    }
    if (empty($url_parsed['host'])) {
        // parse_url() failed on the second attempt or found no host: nothing to connect to.
        $this->debug_log('is_referer_linking_us(): could not determine host of «' . $referer . '»');
        return false;
    }

    $host = $url_parsed['host'];
    $port = empty($url_parsed['port']) ? 80 : $url_parsed['port'];
    $path = empty($url_parsed['path']) ? '/' : $url_parsed['path'];
    if (!empty($url_parsed['query'])) {
        $path .= '?' . $url_parsed['query'];
    }

    $fp = @fsockopen($host, $port, $errno, $errstr, 30);
    if (!$fp) { // could not access referring page
        $this->debug_log('is_referer_linking_us(): could not access «' . $referer . '» (host: ' . $host . '): ' . $errstr . ' (#' . $errno . ')');
        return false;
    }

    // Set timeout for data:
    if (function_exists('stream_set_timeout')) {
        stream_set_timeout($fp, 20);
    } else {
        socket_set_timeout($fp, 20); // PHP 4
    }

    // Send request:
    $out = "GET {$path} HTTP/1.0\r\n";
    $out .= "Host: {$host}:{$port}\r\n";
    $out .= "Connection: Close\r\n\r\n";
    fwrite($fp, $out);

    // Skip headers (give up after 100 lines), remembering the charset if one is announced:
    $i = 0;
    $source_charset = 'iso-8859-1'; // default
    while (($s = fgets($fp, 4096)) !== false) {
        $i++;
        if ($s == "\r\n" || $i > 100) {
            break;
        }
        if (preg_match('~^Content-Type:.*?charset=([\\w-]+)~i', $s, $match)) {
            $source_charset = $match[1];
        }
    }

    // Get the refering page's content
    $content_ref_page = '';
    $bytes_read = 0;
    while (($s = fgets($fp, 4096)) !== false) {
        $content_ref_page .= $s;
        $bytes_read += strlen($s);
        if ($bytes_read > 512000) { // do not pull more than 500kb of data!
            break;
        }
    }
    fclose($fp);

    if (!strlen($content_ref_page)) {
        $this->debug_log('is_referer_linking_us(): empty $content_ref_page (' . bytesreadable($bytes_read) . ' read)');
        return false;
    }

    // Build the search pattern:
    // We match for basically for 'href="[SERVER][URI]', where [SERVER] is a list of possible hosts (especially IDNA)
    $search_pattern = '~\\shref=["\']?https?://(';

    $possible_hosts = array($_SERVER['HTTP_HOST']);
    if ($_SERVER['SERVER_NAME'] != $_SERVER['HTTP_HOST']) {
        $possible_hosts[] = $_SERVER['SERVER_NAME'];
    }

    $search_pattern_hosts = array();
    foreach ($possible_hosts as $l_host) {
        if (preg_match('~^([^.]+\\.)(.*?)([^.]+\\.[^.]+)$~', $l_host, $match)) { // we have subdomains in this hostname
            if (stristr($match[1], 'www')) { // search also for hostname without 'www.'
                $search_pattern_hosts[] = $match[2] . $match[3];
            }
        }
        $search_pattern_hosts[] = $l_host;
    }

    // array_unique() keeps the original keys, so removing a duplicate can leave a gap in the
    // numeric keys. Re-index, so that later appends and the quoting below see every host
    // exactly once and no empty alternative sneaks into the pattern.
    $search_pattern_hosts = array_values(array_unique($search_pattern_hosts));

    $have_idn_name = false;
    foreach ($search_pattern_hosts as $l_host) { // add IDN, because this could be linked:
        $l_idn_host = idna_decode($l_host); // the decoded puny-code ("xn--..") name (utf8)
        if ($l_idn_host != $l_host) {
            $have_idn_name = true;
            $search_pattern_hosts[] = $l_idn_host;
        }
    }

    // add hosts to pattern, preg_quoted
    foreach ($search_pattern_hosts as $k => $l_host) {
        $search_pattern_hosts[$k] = preg_quote($l_host, '~');
    }
    $search_pattern .= implode('|', $search_pattern_hosts) . ')';

    if (empty($uri)) { // host(s) should end with "/", "'", '"', "?" or whitespace
        $search_pattern .= '[/"\'\\s?]';
    } else {
        $search_pattern .= preg_quote($uri, '~');
        // URI should end with "'", '"' or whitespace
        $search_pattern .= '["\'\\s]';
    }
    $search_pattern .= '~i';

    if ($have_idn_name) { // Convert charset to UTF-8, because the decoded domain name is UTF-8, too:
        if (can_convert_charsets('utf-8', $source_charset)) {
            $content_ref_page = convert_charset($content_ref_page, 'utf-8', $source_charset);
        } else {
            $this->debug_log('is_referer_linking_us(): warning: cannot convert charset of referring page');
        }
    }

    if (preg_match($search_pattern, $content_ref_page)) {
        $this->debug_log('is_referer_linking_us(): found current URL in page (' . bytesreadable($bytes_read) . ' read)');
        return true;
    }

    if (strpos($referer, $ReqHost) === 0 && !empty($uri)) { // Referer is the same host.. just search for $uri
        if (strpos($content_ref_page, $uri) !== false) {
            $this->debug_log('is_referer_linking_us(): found current URI in page (' . bytesreadable($bytes_read) . ' read)');
            return true;
        }
    }

    $this->debug_log('is_referer_linking_us(): ' . sprintf('did not find «%s» in «%s» (%s bytes read).', $search_pattern, $referer, bytesreadable($bytes_read)));
    return false;
}