/**
 * Convert rendered wiki (X)HTML back into wiki markup.
 *
 * The conversion runs in three stages:
 *  1. a fixed, ordered list of regular-expression rewrites turns simple inline
 *     markup and the special <!--[...]--> marker comments into wiki syntax;
 *  2. whatever is left is wrapped in a <wiki> root element and handed to the
 *     shared XML_Wiki_Parser instance kept in the global $_wiki_xhtml_parser;
 *  3. the parser output is tidied up (del/ins pairs, blank lines, blockquote
 *     indentation).
 *
 * When the XML parser reports an error, the parser's error message followed by
 * the tag-stripped text is returned instead of wiki markup.
 *
 * @param string      $html     the html text to translate
 * @param array|false $options  optional settings; 'target' => 'line' flattens
 *                              the result to a single line without <p> tags
 * @return string               the wiki text (or error message plus plain text)
 */
function any_wiki_towiki ( $html, $options = false )
{
    global $_wiki_xhtml_parser;

    // Normalise the newlines and pad the text with newlines on both sides.
    $html = "\n" . wiki_normalize_newlines($html) . "\n\n\n";

    // html block (escapes all that is within); handled by a callback.
    $html = preg_replace_callback('|<!--\[html\]-->(.*?)<!--\[/html]-->|s', '_towiki_html', $html);

    // Ordered list of (pattern, replacement) pairs. The order matters: every
    // rule operates on the output of the rule before it.
    $rewrites = array(
        // Simple inline markup
        array('!</?(i|em)>!',      '//'),
        array('!</?(b|strong)>!',  '**'),
        array('|<sup><a|',         '<a'),
        array('|</a></sup>|',      '</a>'),
        array('|</?sup>|',         '^^'),
        array('|<tt>|',            '{{'),
        array('|</tt>|',           '}}'),

        // Coloured texts
        array('|<span style="color: #?([^"]*);">(.*?)</span>|s', '##\1|\2##'),

        // Mailto links: first the ones without a space, then all remaining ones
        array('|<!--\[mailto\]-->([^ ]*?)<!--\[/mailto\]-->|', 'mailto:\1'),
        array('|<!--\[mailto\]-->(.*?)<!--\[/mailto\]-->|',    '[mailto:\1]'),

        // Images
        array('/<!--(\[image .*?\])-->/', '[\1]'),

        // Literal texts
        array('|<!--\[lit\]-->(.*?)<!--\[/lit]-->|s', '``\1``'),

        // Comments are turned into elements so that the XML parser sees them
        array('|<!-- (.*?) -->|s', '<x-comment>\1</x-comment>'),
    );

    foreach ($rewrites as $rule) {
        $html = preg_replace($rule[0], $rule[1], $html);
    }

    // Table of contents: only do the work when a toc marker is present.
    if (strpos($html, '<!--[toc]-->') !== false) {
        // The generated toc itself becomes the [[toc]] tag again ...
        $html = preg_replace('|<!--\[toc\]-->.*?<!--\[/toc]-->|s', "\n[[toc]]\n", $html);

        // ... and the toc anchors are removed from the headers.
        $html = preg_replace('|(<h[2-6]>)<a name=\'[0-9]+-[0-9]+\'></a>|', '\1', $html);
    }

    // Make sure that the XML parser gets legal utf8 text.
    $html = any_text_utf8($html);

    // Parse the remaining text with the (lazily created) shared parser.
    if (!isset($_wiki_xhtml_parser)) {
        $_wiki_xhtml_parser = new XML_Wiki_Parser();
    }

    $_wiki_xhtml_parser->reset();
    $result = $_wiki_xhtml_parser->parseString("<wiki>\n$html\n</wiki>", true);

    if (!$_wiki_xhtml_parser->isError($result)) {
        $html = $_wiki_xhtml_parser->wiki;
    } else {
        // Error in the XML: fall back to the message plus the bare text.
        $html = $result->getMessage() . "\n\n\n" . strip_tags($html);
    }

    $_wiki_xhtml_parser->reset();

    // Correct <del/><ins/> pairs.
    $html = preg_replace('|(@@---.*?)@@@@(\+\+\+.*?@@)|', '\1\2', $html);

    // Collapse runs of two or more newlines into exactly two, and end the
    // text with a single newline.
    $collapsed = preg_replace("/\n\n\n*/", "\n\n", $html);
    $html      = trim($collapsed) . "\n";

    // Remove superfluous spaces at line starts in blockquotes.
    $html = preg_replace("/\n(>+)[ \t]+/", "\n\\1 ", $html);

    // When a single line is requested, drop the newlines and the <p/>'s.
    $wants_line = !empty($options['target']) && $options['target'] == 'line';
    if ($wants_line) {
        $html = str_replace(array("\n", "<p>", "</p>"), array(' ', '', ''), $html);
    }

    return $html;
}
/**
 * Split wiki text into a flat list of tokens.
 *
 * The text is scanned byte by byte. Two parallel lists are built: the token
 * types and the token texts. Token types produced by this function are:
 *
 *   "p", "newline", "br", " ", "literal", "`", "html", "code", "comment",
 *   "<", ">", "em", "strong", "sup", "toc", ":", "://", a single or doubled
 *   character out of @ # ( ) | [ ] { }, a single / * ^ or ', "url",
 *   "mailto", "wiki-word", any class returned by _charclass(), and a final
 *   "end" token with an empty text.
 *
 * Two newlines are appended to the normalised input; they act as a sentinel so
 * that the look-ahead reads ($i+1, $i+2) never run past the end of the string.
 *
 * @param string $s        the wiki text to tokenize
 * @param array  $options  settings; 'allow_wikiword' enables WikiWord detection
 * @return array           array($tk, $tk_s): token types and token texts
 */
function wiki_tokenizer ( $s, $options = array() )
{
    $s = wiki_normalize_newlines($s) . "\n\n";

    $i         = 0;             // the offset of the scanner
    $line_offs = 0;             // the token offset in the current line
    $len       = strlen($s);    // the length of the input stream
    $tk        = array();       // the token list returned
    $tk_s      = array();       // token strings

    // Get the settings
    $allow_wikiword = !empty($options['allow_wikiword']);

    // Translate the character stream into tokens, use the ending "\n\n" as a buffer.
    while ($i < $len - 2) {
        $c    = $s[$i];
        $n_c  = $s[$i + 1];
        $nn_c = $s[$i + 2];

        $line_offs++;

        switch ($c) {
        case "\n":
            if ($n_c === "\n") {
                // Any run of two or more newlines is one paragraph separator.
                while ($i < $len - 1 && $s[$i + 1] === "\n") {
                    $i++;
                }
                $tk[]   = "p";
                $tk_s[] = "\n\n";
            } else {
                $tk[]   = "newline";
                $tk_s[] = "\n";
            }
            $line_offs = 0;
            break;

        case ' ':
            if ($n_c === '_' && $nn_c === "\n") {
                // " _\n" is a forced line break.
                $tk[]   = 'br';
                $tk_s[] = " _\n";
                $i     += 2;
            } else {
                // Collect the whole run of spaces.
                $tok = ' ';
                while ($s[$i + 1] === ' ') {
                    $tok .= ' ';
                    $i++;
                }

                if ($s[$i + 1] === '_' && $s[$i + 2] === "\n") {
                    // A run of spaces before "_\n" is still a forced break.
                    $tk[]   = 'br';
                    $tk_s[] = " _\n";
                    $i     += 2;
                } else {
                    $tk[]   = ' ';
                    $tk_s[] = $tok;
                }
            }
            break;

        case '`':
            if ($n_c === '`') {
                // ``literal``: collect everything up to the closing ``.
                $j   = $i + 2;
                $tok = '';
                while ($j < $len - 2 && ($s[$j] !== '`' || $s[$j + 1] !== '`')) {
                    $tok .= $s[$j];
                    $j++;
                }

                if ($s[$j] === '`' && $s[$j + 1] === '`') {
                    $tk[]   = 'literal';
                    $tk_s[] = str_replace("\n", " ", $tok);
                    $i      = $j + 1;
                } else {
                    // No closing ``: emit a plain backtick and rescan the rest.
                    $tk[]   = '`';
                    $tk_s[] = '`';
                }
            } else {
                $tk[]   = '`';
                $tk_s[] = '`';
            }
            break;

        case '<':
            // Check for <html> on one line
            if (   $line_offs === 1
                && substr($s, $i, 7) === "<html>\n"
                && ($end = strpos($s, "\n</html>\n", $i + 5)) !== false) {
                $tk[]   = 'html';
                $tk_s[] = substr($s, $i + 7, $end - ($i + 6));
                $i      = $end + 8;
                // The newline after </html> has been consumed, so the next
                // character starts a new line.
                $line_offs = 0;
            }
            // Check for <code> on one line
            else if (  $line_offs === 1
                    && substr($s, $i, 7) === "<code>\n"
                    && ($end = strpos($s, "\n</code>\n", $i + 5)) !== false) {
                $tk[]   = 'code';
                $tk_s[] = substr($s, $i + 7, $end - ($i + 6));
                $i      = $end + 8;
                // See above: the closing newline is part of this token.
                $line_offs = 0;
            }
            // Check for a <!-- ... --> block
            else if (  substr($s, $i, 4) === '<!--'
                    && ($end = strpos($s, '-->', $i + 4)) !== false) {
                $tk[]   = 'comment';
                $tk_s[] = trim(substr($s, $i + 4, $end - ($i + 4)));
                $i      = $end + 2;
            } else {
                $tk[]   = '<';
                $tk_s[] = '<';
            }
            break;

        case '/':
            if ($n_c === '/') {
                $tk[]   = "em";
                $tk_s[] = "//";
                $i++;
            } else {
                $tk[]   = $c;
                $tk_s[] = $c;
            }
            break;

        case '*':
            if ($n_c === '*') {
                $tk[]   = "strong";
                $tk_s[] = "**";
                $i++;
            } else {
                $tk[]   = $c;
                $tk_s[] = $c;
            }
            break;

        case '^':
            if ($n_c === '^') {
                $tk[]   = "sup";
                $tk_s[] = "^^";
                $i++;
            } else {
                $tk[]   = $c;
                $tk_s[] = $c;
            }
            break;

        case '@':
        case '#':
        case '(':
        case ')':
        case '|':
        case '[':
        case ']':
        case '{':
        case '}':
            if (   $c === '['
                && $n_c === '['
                && $line_offs === 1
                && substr($s, $i, 8) === "[[toc]]\n") {
                // Block-level [[toc]]; the newline after it is left in the
                // stream so that it produces its own token.
                $tk[]   = 'toc';
                $tk_s[] = '[[toc]]';
                $i     += 6;
            } else if ($n_c === $c) {
                // A doubled character is a single token.
                $tk[]   = $c . $c;
                $tk_s[] = $c . $c;
                $i++;
            } else {
                $tk[]   = $c;
                $tk_s[] = $c;
            }
            break;

        case '>':
            // A run of '>' is one token; its text holds the whole run.
            $tok = '>';
            while ($s[$i + 1] === '>') {
                $tok .= '>';
                $i++;
            }
            $tk[]   = ">";
            $tk_s[] = $tok;
            break;

        case '\'':
            if ($n_c === '\'' && $nn_c === '\'') {
                $tk[]   = "strong";
                $tk_s[] = "'''";
                $i     += 2;
            } else if ($n_c === '\'') {
                $tk[]   = "em";
                $tk_s[] = "''";
                $i++;
            } else {
                $tk[]   = $c;
                $tk_s[] = $c;
            }
            break;

        case ':':
            if ($n_c === '/' && $nn_c === '/') {
                $tk[]   = '://';
                $tk_s[] = '://';
                $i     += 2;
            } else {
                $tk[]   = ':';
                $tk_s[] = ':';
            }
            break;

        default:
            // Group consecutive characters of the same class into one token.
            // The class values come from _charclass(), hence the loose
            // comparisons on $class.
            $class = _charclass($c);
            $tok   = $c;
            $j     = $i;
            while ($class == _charclass($s[$j + 1]) && $j < $len - 2) {
                $j++;
                $tok .= $s[$j];
            }

            if ($class == 'word') {
                $is_url    = ($tok === 'http' || $tok === 'https') && substr($s, $j + 1, 3) === '://';
                $is_mailto = $tok === 'mailto' && $s[$j + 1] === ':';

                if ($is_url || $is_mailto) {
                    // http://, https:// or mailto: -- fetch till whitespace,
                    // a quote or one of "|[](){}<>". The sentinel newlines
                    // guarantee that this loop terminates.
                    $class = $is_url ? 'url' : 'mailto';
                    while (strpos("\n\t |[](){}<>\"'", $s[$j + 1]) === false) {
                        $j++;
                        $tok .= $s[$j];
                    }
                } else if (   $allow_wikiword
                           && $c >= 'A'
                           && $c <= 'Z'
                           && preg_match('/^[A-Z][a-z0-9_]+[A-Z][a-zA-Z0-9_]*$/', $tok)) {
                    $class = "wiki-word";
                }
            }

            $tk[]   = $class;
            $tk_s[] = $tok;
            $i      = $j;
            break;
        }

        $i++;
    }

    $tk[]   = 'end';
    $tk_s[] = '';

    return array($tk, $tk_s);
}