/**
 * Tries to convert the given HTML into a plain text format - best suited for
 * e-mail display, etc.
 *
 * <p>In particular, it tries to maintain the following features:
 * <ul>
 *   <li>Links are maintained, with the 'href' copied over
 *   <li>Information in the <head> is lost
 * </ul>
 *
 * <p>If the HTML cannot be parsed at all, the newline-normalised input is
 * returned as-is (best effort) rather than raising an error.
 *
 * @param string $html  the input HTML
 * @param int    $width forwarded to the node tree's render() call; presumably
 *                      the maximum line width in columns (TODO confirm)
 * @return string the HTML converted, as best as possible, to text
 */
function convert_html_to_text($html, $width = 74) {
    $html = fix_newlines($html);

    // Hint the encoding to loadHTML() by prepending an XML declaration, unless
    // the input already carries one. The hint is kept in a separate variable
    // so that the failure path below never returns text the caller did not
    // supply.
    $source = $html;
    if (strpos($source, '<?xml ') === false) {
        $source = '<?xml encoding="utf-8"?>' . $source;
    } # <?php (4vim)

    $doc = new DOMDocument('1.0', 'utf-8');

    // Parser warnings are deliberately silenced: real-world HTML is rarely
    // well formed and this function is best-effort.
    if (!@$doc->loadHTML($source)) {
        return $html;
    }

    // Thanks, http://us3.php.net/manual/en/domdocument.loadhtml.php#95251
    // dirty fix -- remove the inserted processing instruction
    foreach ($doc->childNodes as $item) {
        if ($item->nodeType === XML_PI_NODE) {
            $doc->removeChild($item); // remove hack
            break; // at most one node is removed; stop before iterating a mutated list
        }
    }

    $elements = identify_node($doc);

    // identify_node() may hand back something that is not an element tree;
    // in that case there is nothing to style or render, so use it verbatim.
    if (!is_object($elements)) {
        return trim($elements);
    }

    // Add the default stylesheet
    $elements->getRoot()->addStylesheet(HtmlStylesheet::fromArray(array(
        'html' => array('white-space' => 'pre'),
        'p' => array('margin-bottom' => '1em'),
        'pre' => array('white-space' => 'pre'),
    )));

    $options = array();
    $output = $elements->render($width, $options);

    return trim($output);
}
/**
 * Tries to convert the given HTML into a plain text format - best suited for
 * e-mail display, etc.
 *
 * <p>In particular, it tries to maintain the following features:
 * <ul>
 *   <li>Links are maintained, with the 'href' copied over
 *   <li>Information in the <head> is lost
 * </ul>
 *
 * @param string $html the input HTML
 * @return string the HTML converted, as best as possible, to text, with
 *         spaces/tabs around each line break and leading/trailing whitespace
 *         removed
 * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
 *         (this includes an empty input string)
 */
function convert_html_to_text($html) {
    $html = fix_newlines($html);

    $doc = new DOMDocument();

    // DOMDocument::loadHTML() does not accept an empty string: older PHP
    // versions warn and return false, PHP 8 raises a ValueError. Checking up
    // front guarantees callers always see the documented Html2TextException.
    if ($html === '' || !$doc->loadHTML($html)) {
        throw new Html2TextException("Could not load HTML - badly formed?", $html);
    }

    $output = iterate_over_node($doc);

    // remove leading and trailing spaces on each line
    $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

    // remove leading and trailing whitespace
    $output = trim($output);

    return $output;
}
$passed = 0; $failed = 0; foreach ($tests as $test) { echo "[test {$test}]\n"; if (!file_exists(__DIR__ . "/{$test}.html")) { echo "FAILED: File '{$test}.html' did not exist\n\n"; $failed++; continue; } if (!file_exists(__DIR__ . "/{$test}.txt")) { echo "FAILED: File '{$test}.txt' did not exist\n\n"; $failed++; continue; } $input = file_get_contents(__DIR__ . "/{$test}.html"); $expected = fix_newlines(file_get_contents(__DIR__ . "/{$test}.txt")); $output = convert_html_to_text($input); if ($output != $expected) { file_put_contents(__DIR__ . "/{$test}.output", $output); // mark whitespace /* $output = str_replace(" ", ".", $output); $expected = str_replace(" ", ".", $expected); $output = str_replace("\t", " -> ", $output); $expected = str_replace("\t", " -> ", $expected); $output = str_replace("\r", "\\r\r", $output); $expected = str_replace("\r", "\\r\r", $expected); $output = str_replace("\n", "\\n\n", $output); $expected = str_replace("\n", "\\n\n", $expected); */ echo "FAILED: Expected:\n[{$expected}]\n\nGot:\n[{$output}]\n\n";