Exemplo n.º 1
0
/**
 * Translate (wiki generated) html back into wiki text.
 *
 * The text first goes through a fixed sequence of regular expression
 * rewrites that turn simple tags and the <!--[...]--> marker comments into
 * wiki markup. The remainder is wrapped in a <wiki> element and handed to
 * the shared XML_Wiki_Parser instance kept in the global
 * $_wiki_xhtml_parser. The parser output is then cleaned up.
 *
 * @param string      $html     the html text to convert
 * @param array|false $options  optional settings; when $options['target']
 *                              equals 'line' the result is flattened to a
 *                              single line
 * @return string the wiki text, or - when the XML parser reports an error -
 *                the parser message followed by the tag-stripped text
 */
function any_wiki_towiki ( $html, $options = false )
{
	global $_wiki_xhtml_parser;

	// Normalize the line ends and pad the text with newlines on both sides
	$html = "\n" . wiki_normalize_newlines($html) . "\n\n\n";

	// html block (escapes all that is within), must run before any other rewrite
	$html = preg_replace_callback('|<!--\[html\]-->(.*?)<!--\[/html]-->|s', '_towiki_html', $html);

	// Ordered list of (pattern, replacement) pairs; every rewrite works on
	// the result of the previous one, so the order is significant.
	$rewrites = array(
		// simple markup
		array('!</?(i|em)>!',		'//'),
		array('!</?(b|strong)>!',	'**'),
		array('|<sup><a|',			'<a'),
		array('|</a></sup>|',		'</a>'),
		array('|</?sup>|',			'^^'),
		array('|<tt>|',				'{{'),
		array('|</tt>|',			'}}'),

		// coloured texts
		array('|<span style="color: #?([^"]*);">(.*?)</span>|s', '##\1|\2##'),

		// mailto links: without spaces first, then the remaining ones in brackets
		array('|<!--\[mailto\]-->([^ ]*?)<!--\[/mailto\]-->|', 'mailto:\1'),
		array('|<!--\[mailto\]-->(.*?)<!--\[/mailto\]-->|',    '[mailto:\1]'),

		// images
		array('/<!--(\[image .*?\])-->/', '[\1]'),

		// literal texts
		array('|<!--\[lit\]-->(.*?)<!--\[/lit]-->|s', '``\1``'),

		// comments
		array('|<!-- (.*?) -->|s', '<x-comment>\1</x-comment>'),
	);

	foreach ($rewrites as $rewrite)
	{
		list($pattern, $replacement) = $rewrite;
		$html = preg_replace($pattern, $replacement, $html);
	}

	// toc: only touch the text when a toc marker is present
	if (strpos($html, '<!--[toc]-->') !== false)
	{
		// the generated table of contents becomes the [[toc]] tag again
		$html = preg_replace('|<!--\[toc\]-->.*?<!--\[/toc]-->|s', "\n[[toc]]\n", $html);

		// drop the toc anchors in the headers
		$html = preg_replace('|(<h[2-6]>)<a name=\'[0-9]+-[0-9]+\'></a>|', '\1', $html);
	}

	// The XML parser must be fed legal utf8 text
	$html = any_text_utf8($html);

	// Lazily create the shared parser and parse the remaining text
	if (!isset($_wiki_xhtml_parser))
	{
		$_wiki_xhtml_parser = new XML_Wiki_Parser();
	}
	$_wiki_xhtml_parser->reset();

	$result = $_wiki_xhtml_parser->parseString("<wiki>\n{$html}\n</wiki>", true);

	// On an XML error there is no wiki text: return the message plus the stripped text
	$html = $_wiki_xhtml_parser->isError($result)
		? ($result->getMessage() . "\n\n\n" . strip_tags($html))
		: $_wiki_xhtml_parser->wiki;

	// Only reset the shared parser, it is not freed so that it can be reused
	$_wiki_xhtml_parser->reset();

	// Correct <del/><ins/> pairs
	$html = preg_replace('|(@@---.*?)@@@@(\+\+\+.*?@@)|', '\1\2', $html);

	// Collapse runs of empty lines and end the text with exactly one newline
	$html = preg_replace("/\n\n\n*/", "\n\n", $html);
	$html = trim($html) . "\n";

	// Remove too many spaces at linestarts in blockquotes
	$html = preg_replace("/\n(>+)[ \t]+/", "\n\\1 ", $html);

	// When a line is requested, remove newlines and superfluous <p/>'s
	$want_line = !empty($options['target']) && $options['target'] == 'line';
	if ($want_line)
	{
		$html = str_replace(array("\n", "<p>", "</p>"), array(' ', '', ''), $html);
	}

	return $html;
}
/**
 * Translate wiki text into a flat list of tokens.
 *
 * The text is newline-normalized and two newlines are appended; these act
 * as a sentinel so the scanner can always look two characters ahead.
 *
 * Token types produced by this function:
 *  - "p" (two or more newlines), "newline", "br" (space, underscore, newline)
 *  - " " (a run of spaces; the token string holds the whole run)
 *  - "literal" (text between double backticks, newlines replaced by spaces)
 *  - "html", "code" (a block between <html>/<code> lines), "comment", "toc"
 *  - "em", "strong", "sup", "://" and single or doubled punctuation characters
 *  - ">" (a run of '>' characters; the token string holds the whole run)
 *  - "url", "mailto", "wiki-word", or otherwise the class reported by
 *    _charclass() for a run of characters of the same class
 *  - "end" (always the last token)
 *
 * @param string $s        the wiki text
 * @param array  $options  settings; a non-empty 'allow_wikiword' enables the
 *                         detection of WikiWords
 * @return array array($tk, $tk_s): two parallel lists holding the token
 *               types and the matching token strings
 */
function wiki_tokenizer ( $s, $options = array() )
{
	$s  		= wiki_normalize_newlines($s) . "\n\n";
	$i  		= 0;			// the offset of the scanner
	$line_offs	= 0;			// the token offset in the current line
	$len 		= strlen($s);	// the length of the input stream

	$tk			= array();		// the token list returned
	$tk_s		= array();		// token strings

	// Get the settings
	$allow_wikiword	= !empty($options['allow_wikiword']);
	
	// Translate the character stream into tokens, use the ending "\n" as a buffer.
	// Note: string offsets use [] - the {} form is a parse error as of PHP 8.
	while ($i < $len-2)
	{
		$c		= $s[$i];
		$n_c	= $s[$i+1];
		$nn_c	= $s[$i+2];
		
		// Counts tokens, not characters: 1 means "first token on this line"
		$line_offs++;

		switch ($c)
		{
		case "\n":
			if ($n_c === "\n")
			{
				// Two or more newlines form a single paragraph break
				while ($i < $len - 1 && $s[$i+1] === "\n")
				{
					$i++;
				}
				$tk[]   = "p";
				$tk_s[] = "\n\n";
			}
			else
			{
				$tk[]   = "newline";
				$tk_s[] = "\n";
			}
			$line_offs = 0;
			break;
		
		case ' ':
			if ($n_c === '_' && $nn_c === "\n")
			{
				$tk[]   = 'br';
				$tk_s[] = " _\n";
				$i   += 2;
			}
			else
			{
				// Collect the run of spaces (terminated by the sentinel newlines)
				$tok = ' ';
				while ($s[$i+1] === ' ')
				{
					$tok .= ' ';
					$i++;
				}
				
				// A run of spaces followed by "_\n" is a line break as well
				if (	$s[$i+1] === '_'
					&&	$s[$i+2] === "\n")
				{
					$tk[]   = 'br';
					$tk_s[] = " _\n";
					$i   += 2;
				}
				else
				{
					$tk[]   = ' ';
					$tk_s[] = $tok;
				}
			}
			break;
		
		case '`':
			if ($n_c === '`')
			{
				// Scan for the closing double backtick
				$j   = $i+2;
				$tok = '';
				while (		$j < $len - 2
						&&	($s[$j] !== '`' || $s[$j+1] !== '`'))
				{
					$tok .= $s[$j];
					$j++;
				}
				if ($s[$j] === '`' && $s[$j+1] === '`')
				{
					$tk[]   = 'literal';
					$tk_s[] = str_replace("\n", " ", $tok);
					$i      = $j+1;
				}
				else
				{
					// Not closed: emit a single backtick, the next one follows in the next round
					$tk[]	= '`';
					$tk_s[] = '`';
				}
			}
			else
			{
				$tk[]	= '`';
				$tk_s[] = '`';
			}
			break;

		case '<':
			// Check for <html> on one line
			if (	$line_offs === 1
				&&	substr($s, $i, 7) === "<html>\n"
				&&	($end = strpos($s, "\n</html>\n", $i+5)) !== false)
			{
				$tk[]	= 'html';
				$tk_s[] = substr($s, $i+7, $end - ($i+6));
				// Skip up to and including the newline that ends the </html> line
				$i		= $end + 8;
				// That newline was consumed, so the next token starts a new line
				$line_offs = 0;
			}
			// Check for <code> on one line
			else if (	$line_offs === 1
					&&	substr($s, $i, 7) === "<code>\n"
					&&	($end = strpos($s, "\n</code>\n", $i+5)) !== false)
			{
				$tk[]	= 'code';
				$tk_s[] = substr($s, $i+7, $end - ($i+6));
				// Skip up to and including the newline that ends the </code> line
				$i		= $end + 8;
				// That newline was consumed, so the next token starts a new line
				$line_offs = 0;
			}
			// Check for a <!-- ... --> block
			else if (	substr($s, $i, 4) === '<!--'
					&&	($end = strpos($s, '-->', $i+4)) !== false)
			{
				$tk[]	= 'comment';
				$tk_s[] = trim(substr($s, $i+4, $end - ($i+4)));
				$i		= $end + 2;
			}
			else
			{
				$tk[]	= '<';
				$tk_s[]	= '<';
			}
			break;
			
		case '/':
			if ($n_c === '/')
			{
				$tk[]	= "em";
				$tk_s[]	= "//";
				$i+=1;
			}
			else 
			{
				$tk[]   = $c;
				$tk_s[] = $c;
			}
			break;
			
		case '*':
			if ($n_c === '*')
			{
				$tk[]	= "strong";
				$tk_s[]	= "**";
				$i+=1;
			}
			else 
			{
				$tk[]   = $c;
				$tk_s[] = $c;
			}
			break;
			
		case '^':
			if ($n_c === '^')
			{
				$tk[]	= "sup";
				$tk_s[]	= "^^";
				$i++;
			}
			else 
			{
				$tk[]   = $c;
				$tk_s[] = $c;
			}
			break;
			
		case '@':
		case '#':
		case '(':
		case ')':
		case '|':
		case '[':
		case ']':
		case '{':
		case '}':
			// Logical && (the original used the bitwise & operator here)
			if ($c === '[' && $n_c === '[')
			{
				// check for block-level [[toc]]
				if (	$line_offs === 1
					&&	substr($s, $i, 8) === "[[toc]]\n")
				{
					$tk[]   = 'toc';
					$tk_s[] = '[[toc]]';
					// The newline after [[toc]] is left for the next round
					$i     += 6;
				}
				else
				{
					$tk[]   = $c.$c;
					$tk_s[] = $c.$c;
					$i++;
				}
			}
			else if ($n_c === $c)
			{
				// Doubled punctuation is a token of its own
				$tk[]   = $c.$c;
				$tk_s[] = $c.$c;
				$i++;
			}
			else
			{
				$tk[] 	= $c;
				$tk_s[]	= $c;
			}
			break;
		
		case '>':
			// A run of '>' is one token, the token string tells the depth
			$tok = '>';
			while ($s[$i+1] === '>')
			{
				$tok .= '>';
				$i++;
			}
			$tk[]	= ">";
			$tk_s[]	= $tok;
			break;
			
		case '\'':
			if ($n_c === '\'' && $nn_c === '\'')
			{
				$tk[]	= "strong";
				$tk_s[]	= "'''";
				$i+=2;
			}
			else if ($n_c === '\'')
			{
				$tk[]	= "em";
				$tk_s[]	= "''";
				$i+=1;
			}
			else 
			{
				$tk[]   = $c;
				$tk_s[] = $c;
			}
			break;
			
		case ':':
			if ($n_c === '/' && $nn_c === '/')
			{
				$tk[]   = '://';
				$tk_s[] = '://';
				$i += 2;
			}
			else
			{
				$tk[]   = ':';
				$tk_s[] = ':';
			}
			break;
			
		default:
			// Collect the run of characters sharing the class of $c.
			// The bound is checked first so $s[$j+1] is never read past the buffer.
			$class	= _charclass($c);
			$tok	= $c;
			$j		= $i;
			while ($j < $len - 2 && $class === _charclass($s[$j+1]))
			{
				$j++;
				$tok .= $s[$j];
			}
			
			if ($class === 'word')
			{
				if (	(($tok === 'http' || $tok === 'https') && substr($s, $j+1, 3) === '://')
					||	($tok === 'mailto' && $s[$j+1] === ':'))
				{
					// http://  or   mailto: -- fetch till whitespace or one of "])|>"
					if ($tok === 'mailto')
					{
						$class = 'mailto';
					}
					else
					{
						$class = 'url';
					}
					
					// The sentinel newlines normally end this scan, the bound is a safeguard
					while (		$j + 1 < $len
							&&	strpos("\n\t |[](){}<>\"'", $s[$j+1]) === false)
					{
						$j++;
						$tok .= $s[$j];
					}
				}
				else if (	$allow_wikiword
						&&	$c >= 'A' 
						&&	$c <= 'Z'
						&&	preg_match('/^[A-Z][a-z0-9_]+[A-Z][a-zA-Z0-9_]*$/', $tok))
				{
					$class = "wiki-word";
				}
			}
			$tk[]	= $class;
			$tk_s[]	= $tok;
			
			$i = $j;
			break;
		}
		$i++;
	}
	
	$tk[]   = 'end';
	$tk_s[] = '';
	
	return array($tk, $tk_s);
}