/**
 * Run an HTML string through the full chain of clean-up steps and return the
 * resulting XHTML-style markup.
 *
 * The steps run in a fixed order; each helper is defined elsewhere. Helpers
 * whose return value is ignored here are called only for their effect on
 * $html (presumably they take it by reference -- confirm against their
 * definitions).
 *
 * @param string $html Markup to convert.
 * @return string The converted markup, with the saved styles re-inserted.
 */
function html2xhtml($html)
{
    // Elements that are passed through close_tag(), in this order.
    $void_elements = array(
        "area", "base", "basefont", "br", "col", "embed", "frame",
        "hr", "img", "input", "isindex", "link", "meta", "param",
    );

    // Attributes that are passed through make_attr_value(), in this order.
    $flag_attributes = array(
        "checked", "compact", "declare", "defer", "disabled", "ismap",
        "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
        "selected",
    );

    process_pagebreak_commands($html);

    // SCRIPT tags are removed first, because script content may confuse the
    // further HTML-parsing utilities.
    $html = process_script($html);

    // STYLE tags are handled for the same reason; they are kept aside and
    // added back at the very end through insert_styles().
    $saved_styles = process_style($html);

    // Convert HTML character references to their Unicode analogues.
    process_character_references($html);
    remove_comments($html);
    fix_attrs_spaces($html);

    // Attribute and tag-name normalisation.
    $html = quote_attrs($html);
    $html = escape_attrs_entities($html);
    $html = lowercase_tags($html);
    $html = lowercase_closing_tags($html);
    $html = fix_closing_tags($html);

    foreach ($void_elements as $tag_name) {
        $html = close_tag($tag_name, $html);
    }

    foreach ($flag_attributes as $attr_name) {
        $html = make_attr_value($attr_name, $html);
    }

    // Document structure passes.
    $html = process_html($html);
    $html = process_body($html);
    $html = process_head($html);
    $html = process_p($html);

    // Escaping passes.
    $html = escape_amp($html);
    $html = escape_lt($html);
    $html = escape_gt($html);
    $html = escape_textarea_content($html);

    // Container passes; the second argument is always 0 here.
    process_tables($html, 0);
    process_lists($html, 0);
    process_deflists($html, 0);
    process_selects($html, 0);

    $html = fix_tags($html);
    $html = fix_attrs($html);

    return insert_styles($html, $saved_styles);
}
Example #2
0
/**
 * Download the page at $url, build a plain-text story from it and post it to
 * the submission form on dev.soylentnews.org.
 *
 * Progress and failures are reported through privmsg(). The story subject is
 * taken from the page's <title> (cut at the first site-name style delimiter)
 * and the story body from the page's <p> paragraphs that pass the filters
 * below.
 *
 * @param string $url Address of the article to submit.
 * @return bool True when the confirmation text is found in the response to
 *              the submission, False on any failure.
 */
function sn_submit($url)
{
    if ($url == "") {
        return False;
    }
    $url = get_redirected_url($url);
    if ($url === False) {
        privmsg("error: unable to download source (get_redirected_url)");
        return False;
    }
    $host = "";
    $uri = "";
    $port = 80;
    if (get_host_and_uri($url, $host, $uri, $port) == False) {
        privmsg("error: unable to download source (get_host_and_uri)");
        return False;
    }
    $response = wget($host, $uri, $port);
    // Check the download itself. The original re-ran get_host_and_uri() here,
    // so a failed download was never detected. A False or empty response is
    // treated as failure.
    if ($response === False || $response == "") {
        privmsg("error: unable to download source (wget)");
        return False;
    }
    $source_html = strip_headers($response);

    // Subject: page title, truncated at each delimiter that is found in turn.
    $source_title = extract_raw_tag($source_html, "title");
    if ($source_title !== False) {
        $delimiters = array("--", "|", " - ", " : ", " — ", " • ");
        foreach ($delimiters as $delimiter) {
            $cut = strpos($source_title, $delimiter);
            if ($cut !== False) {
                $source_title = trim(substr($source_title, 0, $cut));
            }
        }
    }
    if ($source_title === False || $source_title == "") {
        privmsg("error: title not found or empty");
        return False;
    }
    // Decoded twice, presumably to cope with double-encoded entities.
    $source_title = html_decode($source_title);
    $source_title = html_decode($source_title);

    // A description meta tag must be present, otherwise the submission is
    // refused.
    // NOTE(review): its text is not used in the story; the body is built from
    // the paragraphs below. Confirm whether it was meant as a fallback.
    $meta_description = extract_meta_content($source_html, "description");
    if ($meta_description === False || $meta_description == "") {
        $meta_description = extract_meta_content($source_html, "og:description", "property");
        if ($meta_description === False || $meta_description == "") {
            privmsg("error: description meta content not found or empty");
            return False;
        }
    }

    // Prefer the <article> element when the page has one.
    $html = $source_html;
    $article = extract_raw_tag($html, "article");
    if ($article !== False) {
        $html = $article;
    }
    strip_all_tag($html, "head");
    strip_all_tag($html, "script");
    strip_all_tag($html, "style");
    strip_all_tag($html, "strong");
    $html = strip_tags($html, "<p>");
    $html = lowercase_tags($html);

    // Body: keep only paragraphs that look like article prose.
    $host_parts = explode(".", $host);
    $paragraphs = array();
    foreach (explode("<p", $html) as $paragraph) {
        // Drop what is left of the opening tag (everything up to the first ">").
        $pieces = explode(">", $paragraph);
        if (count($pieces) >= 2) {
            array_shift($pieces);
            $paragraph = implode(">", $pieces);
        }
        $paragraph = strip_tags($paragraph);
        $paragraph = clean_text($paragraph);

        // Skip paragraphs that mention any host-name component longer than
        // three characters.
        $lowered = strtolower($paragraph);
        foreach ($host_parts as $host_part) {
            if (strlen($host_part) > 3 && strpos($lowered, strtolower($host_part)) !== False) {
                continue 2;
            }
        }

        // Skip paragraphs for which filter() returns anything for the digit set.
        if (filter($paragraph, "0123456789") != "") {
            continue;
        }

        // Skip paragraphs that do not end with a full stop. The original also
        // had a loop trimming back to the last "." here; it could never remove
        // anything because of this check, so it has been dropped.
        $length = strlen($paragraph);
        if ($length > 1 && $paragraph[$length - 1] != ".") {
            continue;
        }

        if ($length > 100) {
            $paragraphs[] = $paragraph;
        }
    }
    $source_body = implode("\n\n", $paragraphs);
    $source_body = html_decode($source_body);
    $source_body = html_decode($source_body);

    // Fetch the submission form to obtain its reskey.
    $host = "dev.soylentnews.org";
    $port = 443;
    $uri = "/submit.pl";
    $response = wget($host, $uri, $port, ICEWEASEL_UA);
    $html = strip_headers($response);
    $reskey = extract_text($html, "<input type=\"hidden\" id=\"reskey\" name=\"reskey\" value=\"", "\">");
    if ($reskey === False) {
        privmsg("error: unable to extract reskey");
        return False;
    }

    // Wait between loading the form and posting it.
    // NOTE(review): presumably required by the site before it accepts the reskey; confirm.
    sleep(25);

    $params = array();
    $params["reskey"] = $reskey;
    $params["name"] = get_bot_nick();
    $params["email"] = "";
    $params["subj"] = trim(substr($source_title, 0, 100));
    $params["primaryskid"] = "1";
    $params["tid"] = "6";
    $params["sub_type"] = "plain";
    $params["story"] = $source_body . "\n\n" . $url . "\n\n-- submitted from IRC";
    $params["op"] = "SubmitStory";
    $response = wpost($host, $uri, $port, ICEWEASEL_UA, $params);

    // Reduce the reply to plain text and look for the confirmation sentence.
    $html = strip_headers($response);
    strip_all_tag($html, "head");
    strip_all_tag($html, "script");
    strip_all_tag($html, "style");
    strip_all_tag($html, "a");
    $html = strip_tags($html);
    $html = clean_text($html);
    if (strpos($html, "Perhaps you would like to enter an email address or a URL next time. Thanks for the submission.") !== False) {
        privmsg("submission successful - https://{$host}/submit.pl?op=list");
        return True;
    }
    privmsg("error: something went wrong with your submission");
    return False;
}