Ejemplo n.º 1
0
/**
 * Tries to convert the given HTML into a plain text format - best suited for
 * e-mail display, etc.
 *
 * <p>In particular, it tries to maintain the following features:
 * <ul>
 *   <li>Links are maintained, with the 'href' copied over
 *   <li>Information in the &lt;head&gt; is lost
 * </ul>
 *
 * <p>If the HTML cannot be loaded into a {@link DOMDocument}, the
 * newline-normalised input is returned unchanged.
 *
 * @param string $html the input HTML
 * @param int $width the width handed to the renderer (defaults to 74)
 * @return string the HTML converted, as best as possible, to text
 */
function convert_html_to_text($html, $width = 74)
{
    $html = fix_newlines($html);
    $doc = new DOMDocument('1.0', 'utf-8');
    // Prefix an encoding declaration for the parser unless the markup already
    // carries one. A separate variable is used so that the failure path below
    // hands back the caller's HTML rather than the HTML plus this prefix.
    $source = $html;
    if (strpos($source, '<?xml ') === false) {
        $source = '<?xml encoding="utf-8"?>' . $source;
    }
    # <?php (4vim)
    if (!@$doc->loadHTML($source)) {
        return $html;
    }
    // Thanks, http://us3.php.net/manual/en/domdocument.loadhtml.php#95251
    // dirty fix -- remove the inserted processing instruction
    foreach ($doc->childNodes as $item) {
        if ($item->nodeType === XML_PI_NODE) {
            $doc->removeChild($item);
            // the node list was just modified, so stop iterating after the first removal
            break;
        }
    }
    $elements = identify_node($doc);
    if (!is_object($elements)) {
        // Nothing to style or render: the value itself is the output. This check
        // must happen before any method is called on $elements.
        return trim($elements);
    }
    // Add the default stylesheet
    $elements->getRoot()->addStylesheet(HtmlStylesheet::fromArray(array('html' => array('white-space' => 'pre'), 'p' => array('margin-bottom' => '1em'), 'pre' => array('white-space' => 'pre'))));
    $options = array();
    return trim($elements->render($width, $options));
}
Ejemplo n.º 2
0
/**
 * Best-effort conversion of an HTML string into plain text, e.g. for the
 * text part of an e-mail.
 *
 * <p>Features the conversion tries to maintain:
 * <ul>
 *   <li>Links are kept, with their 'href' copied over
 *   <li>Anything inside &lt;head&gt; is dropped
 * </ul>
 *
 * @param string $html the input HTML
 * @return string the text rendering, with whitespace stripped around every
 *                line break and from both ends of the result
 * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
 */
function convert_html_to_text($html)
{
    $normalised = fix_newlines($html);

    $document = new DOMDocument();
    $loaded = $document->loadHTML($normalised);
    if (!$loaded) {
        // the exception carries the newline-normalised HTML that failed to load
        throw new Html2TextException("Could not load HTML - badly formed?", $normalised);
    }

    $text = iterate_over_node($document);

    // drop spaces and tabs sitting on either side of each line break
    $text = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $text);

    // finally strip whitespace from both ends of the whole text
    return trim($text);
}
Ejemplo n.º 3
0
$passed = 0;
$failed = 0;
foreach ($tests as $test) {
    echo "[test {$test}]\n";
    if (!file_exists(__DIR__ . "/{$test}.html")) {
        echo "FAILED: File '{$test}.html' did not exist\n\n";
        $failed++;
        continue;
    }
    if (!file_exists(__DIR__ . "/{$test}.txt")) {
        echo "FAILED: File '{$test}.txt' did not exist\n\n";
        $failed++;
        continue;
    }
    $input = file_get_contents(__DIR__ . "/{$test}.html");
    $expected = fix_newlines(file_get_contents(__DIR__ . "/{$test}.txt"));
    $output = convert_html_to_text($input);
    if ($output != $expected) {
        file_put_contents(__DIR__ . "/{$test}.output", $output);
        // mark whitespace
        /*
        $output = str_replace(" ", ".", $output);
        $expected = str_replace(" ", ".", $expected);
        $output = str_replace("\t", " -> ", $output);
        $expected = str_replace("\t", " -> ", $expected);
        $output = str_replace("\r", "\\r\r", $output);
        $expected = str_replace("\r", "\\r\r", $expected);
        $output = str_replace("\n", "\\n\n", $output);
        $expected = str_replace("\n", "\\n\n", $expected);
        */
        echo "FAILED: Expected:\n[{$expected}]\n\nGot:\n[{$output}]\n\n";