/**
 * Reduce a string to plain 7-bit characters.
 *
 * The input is passed through charset_to_utf8(), utf8_stripspecials() (with
 * '_' as replacement) and utf8_romanize(). Whatever utf8_fast_umlauts_to_entities()
 * then turns into a numeric entity (&#NNN;) is rewritten as the bare hexadecimal
 * form of NNN, and the remaining named entities and single quotes are removed.
 *
 * @param string $str the string to convert
 * @return string the converted string; when the result of charset_to_utf8()
 *                does not pass utf8_check() it is returned without further processing
 */
function entities_to_7bit($str)
{
    require_once LEPTON_PATH . '/framework/summary.utf8.php';
    // convert to UTF-8
    $str = charset_to_utf8($str);
    if (!utf8_check($str)) {
        return $str;
    }
    // replace some specials
    $str = utf8_stripspecials($str, '_');
    // translate non-ASCII characters to ASCII
    $str = utf8_romanize($str);
    // missed some? - Many UTF-8-chars can't be romanized
    // convert to HTML-entities, and replace entities by hex-numbers
    $str = utf8_fast_umlauts_to_entities($str, false);
    // treat the numeric apostrophe entity as &apos; so it is removed below
    // instead of being converted to the hex number "27"
    $str = str_replace('&#039;', '&apos;', $str);
    // A closure replaces the former /e modifier and create_function() variants,
    // neither of which exists in current PHP versions.
    $str = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($aMatches) {
            return dechex((int) $aMatches[1]);
        },
        $str
    );
    // maybe there are some &gt; &lt; &apos; &quot; &amp; &nbsp; left, replace them too
    $str = str_replace(array('&gt;', '&lt;', '&apos;', '\'', '&quot;', '&amp;'), '', $str);
    // second pass: removing entities above can join fragments into a new "&amp;"
    $str = str_replace('&amp;', '', $str);
    return $str;
}
 /**
  * Runs utf8_stripspecials() against a table of inputs and compares each
  * result with the expected string.
  */
 function test1()
 {
     // one row per case: input string, replacement, additional characters, expected result
     $cases = array(
         array('asciistring', '', '', 'asciistring'),
         array('asciistring', '', '\\._\\-:', 'asciistring'),
         array('ascii.string', '', '\\._\\-:', 'asciistring'),
         array('ascii.string', ' ', '\\._\\-:', 'ascii string'),
         array('2.1.14', ' ', '\\._\\-:', '2 1 14'),
         array('ascii.string', '', '\\._\\-:\\*', 'asciistring'),
         array('ascii.string', ' ', '\\._\\-:\\*', 'ascii string'),
         array('2.1.14', ' ', '\\._\\-:\\*', '2 1 14'),
     );
     foreach ($cases as $case) {
         list($input, $replacement, $additional, $expected) = $case;
         $actual = utf8_stripspecials($input, $replacement, $additional);
         $this->assertEqual($actual, $expected);
     }
 }
Example #3
0
/**
 * Clean a raw page ID using a fixed '_' separator.
 *
 * The ID is trimmed and lower-cased, ';' becomes ':' and '/' becomes the
 * separator. The result is then passed through utf8_romanize(),
 * utf8_deaccent(), utf8_stripspecials() and utf8_strip(), and finally runs of
 * separators and colons are collapsed.
 *
 * @param mixed $raw_id the ID to clean; it is cast to string
 * @return string the cleaned ID
 */
function cleanID($raw_id)
{
    $separator = "_";

    $cleaned = utf8_strtolower(trim((string) $raw_id));
    // alternative namespace separators: ';' -> ':' and '/' -> separator
    $cleaned = strtr($cleaned, ';/', ':' . $separator);
    $cleaned = utf8_deaccent(utf8_romanize($cleaned), -1);
    // remove specials, then hand the result to utf8_strip()
    $cleaned = utf8_strip(utf8_stripspecials($cleaned, $separator, '\\*'));

    // the three clean-up patterns are applied one after another, in this order
    $patterns = array('#\\' . $separator . '+#', '#:+#', '#:[:\\._\\-]+#');
    $replacements = array($separator, ':', ':');

    return preg_replace($patterns, $replacements, $cleaned);
}
Example #4
0
 /**
  * Deliver the requested code block as a download
  *
  * Each call stands for one code block. When the running block counter equals
  * the 'codeblock' request parameter, the text is sent as a plain text
  * attachment and the script exits; otherwise the counter is advanced.
  *
  * @param string      $text     content of the code block
  * @param string|null $language language of the block, 'txt' when empty
  * @param string      $filename download file name, 'snippet.<language>' when empty
  */
 function code($text, $language = NULL, $filename = '')
 {
     global $INPUT;

     $language = $language ? $language : 'txt';
     $filename = $filename ? $filename : 'snippet.' . $language;
     $filename = utf8_stripspecials(utf8_basename($filename), '_');

     // not the wanted block: count it and move on
     if ($this->_codeblock != $INPUT->str('codeblock')) {
         $this->_codeblock++;
         return;
     }

     header("Content-Type: text/plain; charset=utf-8");
     header("Content-Disposition: attachment; filename={$filename}");
     header("X-Robots-Tag: noindex");
     echo trim($text, "\r\n");
     exit;
 }
Example #5
0
 /**
  * Deliver the requested code block as a download
  *
  * Each call stands for one code block. When the running block counter equals
  * the 'codeblock' request parameter, the text is sent as a plain text
  * attachment and the script exits; otherwise the counter is advanced.
  * Clients whose user agent contains "Windows" get CRLF line endings.
  *
  * @param string      $text     content of the code block
  * @param string|null $language language of the block, 'txt' when empty
  * @param string      $filename download file name, 'snippet.<language>' when empty
  */
 function code($text, $language = null, $filename = '')
 {
     global $INPUT;

     $language = $language ?: 'txt';
     $filename = utf8_basename($filename ?: 'snippet.' . $language);
     $filename = utf8_stripspecials($filename, '_');

     // send CRLF to Windows clients
     $isWindowsClient = strpos($INPUT->server->str('HTTP_USER_AGENT'), 'Windows') !== false;
     if ($isWindowsClient) {
         $text = str_replace("\n", "\r\n", $text);
     }

     // not the wanted block: count it and move on
     if ($this->_codeblock != $INPUT->str('codeblock')) {
         $this->_codeblock++;
         return;
     }

     header("Content-Type: text/plain; charset=utf-8");
     header("Content-Disposition: attachment; filename={$filename}");
     header("X-Robots-Tag: noindex");
     echo trim($text, "\r\n");
     exit;
 }
Example #6
0
/**
 * Tokenizes a string into an array of search words
 *
 * Uses the same algorithm as idx_getPageWords()
 *
 * Words shorter than IDX_MINWORDLENGTH are dropped unless they are numeric,
 * and so are words found in the stopword list. Stopword entries are compared
 * as "<word>\n", i.e. each entry is expected to carry its trailing newline.
 *
 * @param string   $string     the query as given by the user
 * @param arrayref $stopwords  array of stopwords; an empty or missing list disables stopword filtering
 * @param boolean  $wc         are wildcards allowed?
 * @return array   list of lower-cased words
 */
function idx_tokenizer($string, &$stopwords, $wc = false)
{
    $words = array();
    // when wildcards are not allowed, '*' is appended to the character list
    // handed to utf8_stripspecials()
    $wc = $wc ? '' : '\\*';
    // guard once for both branches: array_search() must not see a non-array
    $hasStopwords = is_array($stopwords) && $stopwords;

    if (preg_match('/[^0-9A-Za-z]/u', $string)) {
        // handle asian chars as single words (may fail on older PHP version)
        $asia = @preg_replace('/(' . IDX_ASIAN . ')/u', ' \\1 ', $string);
        if (!is_null($asia)) {
            // only take the result when the regexp did not fail
            $string = $asia;
        }
        $arr = explode(' ', utf8_stripspecials($string, ' ', '\\._\\-:' . $wc));
        foreach ($arr as $w) {
            if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) {
                continue;
            }
            $w = utf8_strtolower($w);
            if ($hasStopwords && is_int(array_search("{$w}\n", $stopwords, true))) {
                continue;
            }
            $words[] = $w;
        }
    } else {
        // plain ASCII letters and digits only: the whole string is a single word
        $w = $string;
        if (!is_numeric($w) && strlen($w) < IDX_MINWORDLENGTH) {
            return $words;
        }
        $w = strtolower($w);
        if ($hasStopwords && is_int(array_search("{$w}\n", $stopwords, true))) {
            return $words;
        }
        $words[] = $w;
    }
    return $words;
}
Example #7
0
 /**
  * Break a text into the words used by the fulltext search
  *
  * TODO: does this also need &$stopwords ?
  *
  * @triggers INDEXER_TEXT_PREPARE
  * This event allows plugins to modify the text before it gets tokenized.
  * Plugins intercepting this event should also intercept INDEX_VERSION_GET
  *
  * @param string    $text   plain text
  * @param boolean   $wc     are wildcards allowed?
  * @return array            list of words in the text
  * @author Tom N Harris <*****@*****.**>
  * @author Andreas Gohr <*****@*****.**>
  */
 public function tokenizer($text, $wc = false)
 {
     $wc = $wc ? '' : '\\*';
     $stopwords =& idx_get_stopwords();

     // prepare the text to be tokenized
     $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
     if ($evt->advise_before(true) && preg_match('/[^0-9A-Za-z ]/u', $text)) {
         // handle asian chars as single words (may fail on older PHP version)
         $asia = @preg_replace('/(' . IDX_ASIAN . ')/u', ' \\1 ', $text);
         // keep the original text when the regexp failed
         if (!is_null($asia)) {
             $text = $asia;
         }
     }
     $evt->advise_after();
     unset($evt);

     $text = strtr($text, array("\r" => ' ', "\n" => ' ', "\t" => ' ', "­" => ''));
     if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
         $text = utf8_stripspecials($text, ' ', '\\._\\-:' . $wc);
     }

     $tokens = array();
     foreach (explode(' ', $text) as $word) {
         // pure ASCII words take the cheaper strtolower()
         $word = preg_match('/[^0-9A-Za-z]/u', $word) ? utf8_strtolower($word) : strtolower($word);

         $tooShort = !is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH;
         if ($tooShort || array_search($word, $stopwords, true) !== false) {
             continue;
         }
         $tokens[] = $word;
     }
     return $tokens;
 }
Example #8
0
/**
 * Remove unwanted chars from ID
 *
 * Cleans a given ID to only use allowed characters. Accented characters are
 * converted to unaccented ones
 *
 * Results are memoized in the global $cache_cleanid. The cache is keyed by the
 * raw ID alone, so it is only consulted and filled for the default mode;
 * calls with $ascii set are always computed and never stored.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param  string  $raw_id    The pageid to clean
 * @param  boolean $ascii     Force ASCII
 * @return string cleaned id
 */
function cleanID($raw_id, $ascii = false)
{
    global $conf;
    global $cache_cleanid;
    static $sepcharpat = null;
    $cache =& $cache_cleanid;

    $cacheKey = (string) $raw_id;
    // !empty() avoids a notice when the option is not configured at all
    $logging = !empty($conf['syslog']);
    if ($logging) {
        syslog(LOG_WARNING, '[pageutils.php] cleanID: raw_id: ' . $raw_id);
    }
    // check if it's already in the memory cache (default mode only, see above)
    if (!$ascii && isset($cache[$cacheKey])) {
        return $cache[$cacheKey];
    }
    $sepchar = $conf['sepchar'];
    if ($sepcharpat == null) {
        // build string only once to save clock cycles
        $sepcharpat = '#\\' . $sepchar . '+#';
    }
    $id = trim($cacheKey);
    if ($conf['mixedcase'] == 0) {
        $id = utf8_strtolower($id);
    }
    // alternative namespace separator
    if ($conf['useslash']) {
        $id = strtr($id, ';/', '::');
    } else {
        $id = strtr($id, ';/', ':' . $sepchar);
    }
    if ($conf['deaccent'] == 2 || $ascii) {
        $id = utf8_romanize($id);
    }
    if ($conf['deaccent'] || $ascii) {
        $id = utf8_deaccent($id, -1);
    }
    // remove specials if specialcharacters is set to 0
    if ($conf['specialcharacters'] == 0) {
        $id = utf8_stripspecials($id, $sepchar, '\\*');
    }
    if ($ascii) {
        $id = utf8_strip($id);
    }
    // clean up: collapse separator and colon runs, strip leading/trailing
    // punctuation of the whole ID and of each namespace part
    $id = preg_replace($sepcharpat, $sepchar, $id);
    $id = preg_replace('#:+#', ':', $id);
    $id = trim($id, ':._-');
    $id = preg_replace('#:[:\\._\\-]+#', ':', $id);
    $id = preg_replace('#[:\\._\\-]+:#', ':', $id);
    if (!$ascii) {
        $cache[$cacheKey] = $id;
    }
    if ($logging) {
        syslog(LOG_WARNING, '[pageutils.php] cleanID: id to be returned: ' . $id);
    }
    return $id;
}
Example #9
0
/**
 * Remove unwanted chars from ID
 *
 * Cleans a given ID to only use allowed characters. Accented characters are
 * converted to unaccented ones
 *
 * Results are memoized in the global $cache_cleanid. The cache is keyed by the
 * raw ID alone, so it is only consulted and filled for the default mode;
 * calls with $ascii or $media set are always computed and never stored.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @param  string  $raw_id    The pageid to clean
 * @param  boolean $ascii     Force ASCII
 * @param  boolean $media     Allow leading or trailing _ for media files
 * @return string cleaned id
 */
function cleanID($raw_id, $ascii = false, $media = false)
{
    global $conf;
    global $cache_cleanid;
    static $sepcharpat = null;
    $cache =& $cache_cleanid;

    $cacheKey = (string) $raw_id;
    // only the default mode may use the cache, see the note above
    $cacheable = !$ascii && !$media;

    // check if it's already in the memory cache
    if ($cacheable && isset($cache[$cacheKey])) {
        return $cache[$cacheKey];
    }
    $sepchar = $conf['sepchar'];
    if ($sepcharpat == null) {
        // build string only once to save clock cycles
        $sepcharpat = '#\\' . $sepchar . '+#';
    }
    $id = trim($cacheKey);
    $id = utf8_strtolower($id);
    // alternative namespace separator
    $id = strtr($id, ';', ':');
    if ($conf['useslash']) {
        $id = strtr($id, '/', ':');
    } else {
        $id = strtr($id, '/', $sepchar);
    }
    if ($conf['deaccent'] == 2 || $ascii) {
        $id = utf8_romanize($id);
    }
    if ($conf['deaccent'] || $ascii) {
        $id = utf8_deaccent($id, -1);
    }
    // remove specials
    $id = utf8_stripspecials($id, $sepchar, '\\*');
    if ($ascii) {
        $id = utf8_strip($id);
    }
    // clean up: collapse separator and colon runs, then strip punctuation at
    // both ends; media IDs keep leading/trailing underscores
    $id = preg_replace($sepcharpat, $sepchar, $id);
    $id = preg_replace('#:+#', ':', $id);
    $id = $media ? trim($id, ':.-') : trim($id, ':._-');
    $id = preg_replace('#:[:\\._\\-]+#', ':', $id);
    if ($cacheable) {
        $cache[$cacheKey] = $id;
    }
    return $id;
}
 /**
  * Log external search queries
  *
  * Will not write anything if the referer isn't a search engine
  *
  * The referer's host is matched against the patterns from searchengines.php;
  * if none matches, a few generic parameter names are tried. On a hit $type is
  * set to 'search' and the query is handed to log_search().
  *
  * @param string $referer the referring URL
  * @param string $type    set to 'search' when a search query was recognised
  */
 public function log_externalsearch($referer, &$type)
 {
     $referer = utf8_strtolower($referer);
     include dirname(__FILE__) . '/searchengines.php';
     /** @var array $SEARCHENGINES */
     $query = '';
     $name = '';

     // parse the referer; parse_url() returns false for seriously malformed
     // URLs and leaves out components that are not present
     $urlparts = parse_url($referer);
     if (!is_array($urlparts)) {
         $urlparts = array();
     }
     $domain = isset($urlparts['host']) ? $urlparts['host'] : '';
     $qpart = isset($urlparts['query']) ? $urlparts['query'] : '';
     if (!$qpart) {
         // google does this: the query lives in the fragment
         $qpart = isset($urlparts['fragment']) ? $urlparts['fragment'] : '';
     }
     $params = array();
     parse_str($qpart, $params);

     // check domain against common search engines
     foreach ($SEARCHENGINES as $regex => $info) {
         if (preg_match('/' . $regex . '/', $domain)) {
             $type = 'search';
             $name = array_shift($info);
             // check the known parameters for content
             foreach ($info as $k) {
                 // array-valued parameters (q[]=x) cannot be a query string
                 if (empty($params[$k]) || !is_string($params[$k])) {
                     continue;
                 }
                 $query = $params[$k];
                 break;
             }
             break;
         }
     }

     // try some generic search engine parameters
     if ($type != 'search') {
         foreach (array('search', 'query', 'q', 'keywords', 'keyword') as $k) {
             if (empty($params[$k]) || !is_string($params[$k])) {
                 continue;
             }
             $query = $params[$k];
             // we seem to have found some generic search, generate name from domain
             $name = preg_replace('/(\\.co)?\\.([a-z]{2,5})$/', '', $domain); // strip tld
             $name = explode('.', $name);
             $name = array_pop($name);
             $type = 'search';
             break;
         }
     }

     // still no hit? return
     if ($type != 'search') {
         return;
     }

     // clean the query
     $query = preg_replace('/^(cache|related):[^\\+]+/', '', $query); // non-search queries
     $query = preg_replace('/ +/', ' ', $query); // ws compact
     $query = trim($query);
     if (!utf8_check($query)) {
         // assume latin1 if not utf8
         // NOTE(review): utf8_encode() is deprecated as of PHP 8.2 - replace once
         // an encoding helper is known to be available in this project
         $query = utf8_encode($query);
     }

     // no query? no log
     if (!$query) {
         return;
     }

     // log it!
     $words = explode(' ', utf8_stripspecials($query, ' ', '\\._\\-:\\*'));
     $page = isset($_REQUEST['p']) ? $_REQUEST['p'] : null;
     $this->log_search($page, $query, $words, $name);
 }
 /**
  * Forwards its three arguments unchanged to utf8_stripspecials() and returns
  * the result.
  *
  * NOTE(review): the indentation suggests this is a method of a class that is
  * not visible here; as a method, the call below resolves to the global
  * function of the same name. Declared at file scope it would call itself
  * without end - confirm the enclosing class.
  *
  * @param string $string the string handed to utf8_stripspecials()
  * @param string $repl   second argument, passed through unchanged
  * @param string $keep   third argument, passed through unchanged
  * @return mixed whatever utf8_stripspecials() returns
  */
 function utf8_stripspecials($string, $repl = '', $keep = '')
 {
     return utf8_stripspecials($string, $repl, $keep);
 }
Example #12
0
/**
 * Reduce a string to plain 7-bit characters.
 *
 * The input is passed through charset_to_utf8(), utf8_stripspecials() (with
 * '_' as replacement) and utf8_romanize(). Whatever utf8_fast_umlauts_to_entities()
 * then turns into a numeric entity (&#NNN;) is rewritten as the bare hexadecimal
 * form of NNN, and the remaining named entities and single quotes are removed.
 *
 * @param string $str the string to convert
 * @return string the converted string; when the result of charset_to_utf8()
 *                does not pass utf8_check() it is returned without further processing
 */
function entities_to_7bit($str)
{
    // convert to UTF-8
    $str = charset_to_utf8($str);
    if (!utf8_check($str)) {
        return $str;
    }
    // replace some specials
    $str = utf8_stripspecials($str, '_');
    // translate non-ASCII characters to ASCII
    $str = utf8_romanize($str);
    // missed some? - Many UTF-8-chars can't be romanized
    // convert to HTML-entities, and replace entities by hex-numbers
    $str = utf8_fast_umlauts_to_entities($str, false);
    // treat the numeric apostrophe entity as &apos; so it is removed below
    // instead of being converted to the hex number "27"
    $str = str_replace('&#039;', '&apos;', $str);
    // A callback replaces the former /e modifier, which no longer exists in
    // current PHP versions and made preg_replace() return NULL.
    $str = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($aMatches) {
            return dechex((int) $aMatches[1]);
        },
        $str
    );
    // maybe there are some &gt; &lt; &apos; &quot; &amp; &nbsp; left, replace them too
    $str = str_replace(array('&gt;', '&lt;', '&apos;', '\'', '&quot;', '&amp;'), '', $str);
    // second pass: removing entities above can join fragments into a new "&amp;"
    $str = str_replace('&amp;', '', $str);
    return $str;
}