Example #1
0
/**
 * \brief Parsed HTML tree walker (used by HTML sucker)
 *
 * Walks items 0..$c['contentpos'] of one level of the parsed tree:
 *  - 'text' items are appended to $src verbatim;
 *  - recognised opening tags append wiki markup and, when a closing
 *    counterpart is needed, push it onto $p['stack'];
 *  - closing tags pop $p['stack'] (and emit the stored closing markup) only
 *    when they match the entry on top of it;
 *  - any item carrying a 'content' sub-array is walked recursively.
 *
 * Images are emitted as '( img src=... )' placeholders, not as final {img}
 * syntax; turning them into {img} is left to the caller.
 *
 * Malformed input (missing items or keys, a closing tag while nothing is
 * open) is skipped silently instead of raising PHP notices/warnings.
 *
 * \param &$c array -- parsed HTML: ['contentpos'] = index of the last item,
 *                     [0..contentpos] = items with 'type' ('text' or 'tag'),
 *                     'data', and optionally 'pars' and 'content'
 * \param &$src string -- output string, appended to in place
 * \param &$p array -- walker state shared across recursion levels:
 *                     ['stack'] = closing strings stack,
 *                     ['listack'] = stack of list types currently opened,
 *                     ['first_td'] = flag: 'was <tr> just before this <td>'
 */
function walk_and_parse(&$c, &$src, &$p)
{
    // A missing 'contentpos' is treated as 0, so the loop visits index 0 only.
    $last = isset($c["contentpos"]) ? $c["contentpos"] : 0;

    for ($i = 0; $i <= $last; $i++) {
        if (!isset($c[$i])) {
            // Hole in the item list: nothing to emit, nothing to recurse into.
            continue;
        }

        $type = isset($c[$i]["type"]) ? $c[$i]["type"] : null;

        if ($type == "text") {
            // Plain text goes straight to the destination.
            if (isset($c[$i]["data"])) {
                $src .= $c[$i]["data"];
            }
        } elseif ($type == "tag") {
            $tag = isset($c[$i]["data"]) ? $c[$i]["data"] : array();
            $name = isset($tag["name"]) ? $tag["name"] : null;

            if (isset($tag["type"]) && $tag["type"] == "open") {
                // Opening tag: emit the wiki markup and remember how to close it.
                switch ($name) {
                    case "br":
                        $src .= "\n";
                        break;
                    case "title":
                        $src .= "\n!";
                        $p['stack'][] = array('tag' => 'title', 'string' => "\n");
                        break;
                    case "p":
                        $src .= "\n";
                        $p['stack'][] = array('tag' => 'p', 'string' => "\n");
                        break;
                    case "b":
                        $src .= '__';
                        $p['stack'][] = array('tag' => 'b', 'string' => '__');
                        break;
                    case "i":
                        $src .= "''";
                        $p['stack'][] = array('tag' => 'i', 'string' => "''");
                        break;
                    case "u":
                        $src .= "==";
                        $p['stack'][] = array('tag' => 'u', 'string' => "==");
                        break;
                    case "center":
                        $src .= '::';
                        $p['stack'][] = array('tag' => 'center', 'string' => '::');
                        break;
                    case "code":
                        $src .= '-+';
                        $p['stack'][] = array('tag' => 'code', 'string' => '+-');
                        break;
                    case "h1":
                    case "h2":
                    case "h3":
                    case "h4":
                    case "h5":
                    case "h6":
                        // <hN> becomes a new line starting with N exclamation marks.
                        $src .= "\n" . str_repeat('!', (int) substr($name, 1));
                        $p['stack'][] = array('tag' => $name, 'string' => "\n");
                        break;
                    case "pre":
                        $src .= '~pp~';
                        $p['stack'][] = array('tag' => 'pre', 'string' => '~/pp~');
                        break;
                    // Table parser
                    case "table":
                        $src .= '||';
                        $p['stack'][] = array('tag' => 'table', 'string' => '||');
                        break;
                    case "tr":
                        $p['first_td'] = true;
                        break;
                    case "td":
                        // Cells are separated by '|'; the first cell after <tr> gets none.
                        $src .= !empty($p['first_td']) ? '' : '|';
                        $p['first_td'] = false;
                        break;
                    // Lists parser
                    case "ul":
                        $p['listack'][] = '*';
                        break;
                    case "ol":
                        $p['listack'][] = '#';
                        break;
                    case "li":
                        // Wiki list item: the innermost list marker repeated once per
                        // nesting level, always starting at the beginning of a line.
                        $depth = isset($p['listack']) ? count($p['listack']) : 0;
                        $l = $depth > 0 ? str_repeat(end($p['listack']), $depth) : '';
                        $src .= "\n{$l} ";
                        break;
                    case "font":
                        // Only <font> tags with a color attribute produce markup.
                        if (isset($c[$i]["pars"]["color"]["value"])) {
                            $src .= '~~' . $c[$i]["pars"]["color"]["value"] . ':';
                            $p['stack'][] = array('tag' => 'font', 'string' => '~~');
                        }
                        break;
                    case "img":
                        // Only <img> tags with a src attribute produce markup.
                        if (isset($c[$i]["pars"]["src"]["value"])) {
                            // Placeholder in parentheses, not {img}; the caller rewrites it.
                            $src .= '( img src=' . $c[$i]["pars"]["src"]["value"] . ' )';
                        }
                        break;
                    case "a":
                        // Only <a> tags with an href attribute produce markup.
                        if (isset($c[$i]["pars"]["href"]["value"])) {
                            $src .= '[' . $c[$i]["pars"]["href"]["value"] . '|';
                            $p['stack'][] = array('tag' => 'a', 'string' => ']');
                        }
                        break;
                }
            } else {
                // Closing tag: act only if it closes what is currently open.
                switch ($name) {
                    case "ul":
                        if (!empty($p['listack']) && end($p['listack']) == '*') {
                            array_pop($p['listack']);
                        }
                        break;
                    case "ol":
                        if (!empty($p['listack']) && end($p['listack']) == '#') {
                            array_pop($p['listack']);
                        }
                        break;
                    default:
                        // With nothing open there is nothing to close; checking first
                        // avoids indexing the boolean false that end() returns.
                        if (!empty($p['stack'])) {
                            $e = end($p['stack']);
                            if ($name == $e['tag']) {
                                $src .= $e['string'];
                                array_pop($p['stack']);
                            }
                        }
                        break;
                }
            }
        }

        // Recursive call on items with nested content...
        if (isset($c[$i]["content"])) {
            walk_and_parse($c[$i]["content"], $src, $p);
        }
    }
}
/**
 * Wrapper around zaufi's HTML sucker code, used only for its HTML-to-wiki step.
 *
 * Loads the serialized grammar from lib/htmlparser/htmlgrammar.cmp, parses the
 * HTML with HtmlParser, converts the parsed tree with walk_and_parse() and then
 * post-processes the result: image placeholders become {img} syntax, runs of
 * newlines collapse to one, and '&#160;' entities become plain spaces.
 *
 * If the grammar file cannot be read, the error is shown through the global
 * $smarty object (error.tpl) when one is available, and the script terminates.
 *
 * \param &$inHtml string -- HTML in
 * \return string -- wiki text
 */
function parse_html(&$inHtml)
{
    // $smarty lives in the global scope; without this declaration the error
    // path below would operate on an undefined local variable.
    global $smarty;

    // Read compiled (serialized) grammar
    $grammarfile = 'lib/htmlparser/htmlgrammar.cmp';
    $serialized = is_readable($grammarfile) ? file_get_contents($grammarfile) : false;
    if ($serialized === false) {
        $msg = tra("Can't parse HTML data - no grammar file");
        if (is_object($smarty)) {
            $smarty->assign('msg', $msg);
            $smarty->display("error.tpl");
            die;
        }
        die($msg);
    }
    // NOTE(review): unserialize() is only acceptable here because the grammar is a
    // local file shipped with the code; never feed it user-supplied data.
    $grammar = unserialize($serialized);

    // create parser object, insert html code and parse it
    $htmlparser = new HtmlParser($inHtml, $grammar, '', 0);
    $htmlparser->Parse();

    // Convert the parsed tree to wiki text
    $out_data = '';
    $p = array('stack' => array(), 'listack' => array(), 'first_td' => false);
    walk_and_parse($htmlparser->content, $out_data, $p);

    // Are some tags still open? (Possible if the HTML is not valid, but that is
    // no reason to produce invalid wiki.) Close them innermost-first.
    while (!empty($p['stack'])) {
        $e = array_pop($p['stack']);
        $out_data .= $e['string'];
    }
    // Unclosed lists are ignored: wiki has no special start/end list syntax.

    // Remaining fix-ups, applied in order. The image patterns accept optional
    // whitespace inside the parentheses because walk_and_parse() emits
    // '( img src=... )'.
    $fixups = array(
        // 1) linked images; the link part may not run past a ']' so an earlier
        //    link on the same line is not swallowed
        '/\[([^\]\n]*?)\|\(\s*img src=(.*?)\s*\)\]/' => '{img src=$2 link=$1}',
        // 2) remaining images (not in links)
        '/\(\s*img src=(.*?)\s*\)/' => '{img src=$1}',
        // 3) remove empty lines (greedy, so a whole run collapses to one newline)
        '/\n{2,}/' => "\n",
    );
    foreach ($fixups as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $out_data);
        // preg_replace() returns null on a PCRE error; keep the text we have.
        if ($result !== null) {
            $out_data = $result;
        }
    }

    // 4) remove nbsp's
    $out_data = str_replace('&#160;', ' ', $out_data);

    return $out_data;
}