/**
 * \brief Parsed HTML tree walker (used by the HTML sucker).
 *
 * Walks one level of the parsed HTML tree, appends the wiki markup for every
 * node to $src and recurses into each node's 'content' sub-tree.
 * Open tags that are not handled below produce no markup of their own; text
 * nodes and sub-trees are still processed.
 *
 * Node layout read by this function:
 *   $c['contentpos']               index of the last node on this level
 *   $c[$i]['type']                 'text' or 'tag'
 *   $c[$i]['data']                 text payload, or array('type' => ..., 'name' => ...)
 *                                  for tags (any 'type' other than 'open' is
 *                                  treated as a closing tag)
 *   $c[$i]['pars'][attr]['value']  attribute value (color, src and href are read)
 *   $c[$i]['content']              optional sub-tree with the same layout
 *
 * \param &$c   array  -- parsed HTML
 * \param &$src string -- output string; wiki markup is appended to it
 * \param &$p   array  -- walker state shared across the recursion:
 *                        ['stack']    = closing strings stack (entries: 'tag', 'string'),
 *                        ['listack']  = stack of list types currently opened ('*' or '#'),
 *                        ['first_td'] = flag: was <tr> seen just before this <td>
 */
function walk_and_parse(&$c, &$src, &$p)
{
    // Tags whose wiki form is a fixed opening string plus a fixed closing
    // string that is emitted when the matching close tag is met:
    // tag name => array(opening, closing).
    static $paired = array(
        'title'  => array("\n!", "\n"),
        'p'      => array("\n", "\n"),
        'b'      => array('__', '__'),
        'i'      => array("''", "''"),
        'u'      => array('==', '=='),
        'center' => array('::', '::'),
        'code'   => array('-+', '+-'),
        'h1'     => array("\n!", "\n"),
        'h2'     => array("\n!!", "\n"),
        'h3'     => array("\n!!!", "\n"),
        'h4'     => array("\n!!!!", "\n"),
        'h5'     => array("\n!!!!!", "\n"),
        'h6'     => array("\n!!!!!!", "\n"),
        'pre'    => array('~pp~', '~/pp~'),
        'table'  => array('||', '||'),
    );

    // Tolerate a partially initialised state array.
    if (!isset($p['stack'])) {
        $p['stack'] = array();
    }
    if (!isset($p['listack'])) {
        $p['listack'] = array();
    }
    if (!isset($p['first_td'])) {
        $p['first_td'] = false;
    }

    // Nothing to walk on this level.
    if (!isset($c['contentpos'])) {
        return;
    }
    $last = $c['contentpos'];

    for ($i = 0; $i <= $last; $i++) {
        if (!isset($c[$i])) {
            continue;
        }
        $node = $c[$i];
        $type = isset($node['type']) ? $node['type'] : null;

        if ($type === 'text') {
            // Plain text goes straight to the destination.
            if (isset($node['data'])) {
                $src .= $node['data'];
            }
        } elseif ($type === 'tag') {
            $name = isset($node['data']['name']) ? $node['data']['name'] : '';
            $isOpen = isset($node['data']['type']) && $node['data']['type'] === 'open';

            if ($isOpen) {
                if (isset($paired[$name])) {
                    $src .= $paired[$name][0];
                    $p['stack'][] = array('tag' => $name, 'string' => $paired[$name][1]);
                } else {
                    switch ($name) {
                        case 'br':
                            $src .= "\n";
                            break;

                        // Table parser
                        case 'tr':
                            $p['first_td'] = true;
                            break;
                        case 'td':
                            // Every cell but the first one in a row is preceded by '|'.
                            if (!$p['first_td']) {
                                $src .= '|';
                            }
                            $p['first_td'] = false;
                            break;

                        // Lists parser
                        case 'ul':
                            $p['listack'][] = '*';
                            break;
                        case 'ol':
                            $p['listack'][] = '#';
                            break;
                        case 'li':
                            // Wiki list item: the marker of the innermost list,
                            // repeated once per nesting level, at the start of a line.
                            $depth = count($p['listack']);
                            $marker = $depth > 0 ? str_repeat(end($p['listack']), $depth) : '';
                            $src .= "\n{$marker} ";
                            break;

                        case 'font':
                            // Only a <font> carrying a color attribute produces markup.
                            if (isset($node['pars']['color']['value'])) {
                                $src .= '~~' . $node['pars']['color']['value'] . ':';
                                $p['stack'][] = array('tag' => 'font', 'string' => '~~');
                            }
                            break;

                        case 'img':
                            // Emits the intermediate "(img src=URL)" placeholder, not the
                            // final {img} syntax; parse_html() rewrites it afterwards and
                            // its patterns expect exactly this form (no inner padding).
                            if (isset($node['pars']['src']['value'])) {
                                $src .= '(img src=' . $node['pars']['src']['value'] . ')';
                            }
                            break;

                        case 'a':
                            // Only an <a> carrying an href attribute produces markup.
                            if (isset($node['pars']['href']['value'])) {
                                $src .= '[' . $node['pars']['href']['value'] . '|';
                                $p['stack'][] = array('tag' => 'a', 'string' => ']');
                            }
                            break;
                    }
                }
            } else {
                // Closing tag: is it the one we are waiting for?
                switch ($name) {
                    case 'ul':
                        if (end($p['listack']) === '*') {
                            array_pop($p['listack']);
                        }
                        break;
                    case 'ol':
                        if (end($p['listack']) === '#') {
                            array_pop($p['listack']);
                        }
                        break;
                    default:
                        // end() yields false on an empty stack; a stray close tag
                        // must then be ignored instead of indexing into false.
                        $top = end($p['stack']);
                        if ($top !== false && $name === $top['tag']) {
                            $src .= $top['string'];
                            array_pop($p['stack']);
                        }
                        break;
                }
            }
        }

        // Recursive call on nodes with content.
        if (isset($c[$i]['content'])) {
            walk_and_parse($c[$i]['content'], $src, $p);
        }
    }
}
/**
 * Wrapper around zaufi's HTML sucker code that exposes just the HTML-to-wiki bit.
 *
 * Loads the serialized parser grammar, parses $inHtml with HtmlParser, converts
 * the resulting tree with walk_and_parse() and post-processes the wiki text
 * (image placeholders, empty lines, non-breaking spaces).
 * If the grammar cannot be loaded an error is reported and the script terminates.
 *
 * \param &$inHtml string -- HTML to convert (handed to the HtmlParser constructor)
 * \return string -- the generated wiki text
 */
function parse_html(&$inHtml)
{
    // The template object lives in the global scope; without this import the
    // error branch below would call a method on an undefined variable.
    global $smarty;

    // Read compiled (serialized) grammar.
    // NOTE(review): unserialize() is acceptable here only because the path is a
    // fixed local file; never feed it user-supplied data.
    $grammarfile = 'lib/htmlparser/htmlgrammar.cmp';
    $grammar = false;
    if (is_readable($grammarfile)) {
        $serialized = file_get_contents($grammarfile);
        if ($serialized !== false && $serialized !== '') {
            $grammar = unserialize($serialized);
        }
    }

    if ($grammar === false) {
        $msg = tra("Can't parse HTML data - no grammar file");
        if (is_object($smarty)) {
            $smarty->assign('msg', $msg);
            $smarty->display("error.tpl");
            die;
        }
        // No template engine available: still report the reason before stopping.
        die($msg);
    }

    // Create parser object, insert html code and parse it.
    $htmlparser = new HtmlParser($inHtml, $grammar, '', 0);
    $htmlparser->Parse();

    // Convert the parsed tree to wiki markup.
    $out_data = '';
    $p = array('stack' => array(), 'listack' => array(), 'first_td' => false);
    walk_and_parse($htmlparser->content, $out_data, $p);

    // Some tags may still be open if the HTML was not valid; that is no reason
    // to produce invalid wiki, so emit the pending closing strings, innermost first.
    while (($pending = array_pop($p['stack'])) !== null) {
        $out_data .= $pending['string'];
    }
    // Unclosed lists are ignored: wiki has no special list start/end syntax.

    // 1) Fix linked images: "[URL|(img src=SRC)]" becomes "{img src=SRC link=URL}".
    //    Optional padding inside the parentheses is accepted as well.
    $out_data = preg_replace(
        '/\[(.*?)\|\(\s*img src=(.*?)\s*\)\]/',
        '{img src=$2 link=$1}',
        $out_data
    );

    // 2) Fix the remaining images (not inside links).
    $out_data = preg_replace('/\(\s*img src=(.*?)\s*\)/', '{img src=$1}', $out_data);

    // 3) Remove empty lines. The quantifier has to be greedy: an ungreedy one
    //    matches a single newline at a time and replaces it with itself.
    $out_data = preg_replace('/\n+/', "\n", $out_data);

    // 4) Remove nbsp's, both as HTML entity and as UTF-8 encoded U+00A0.
    $out_data = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $out_data);

    return $out_data;
}