Пример #1
0
/**
 * Recursively renders a DOM node and its descendants as plain text.
 *
 * - Text nodes: runs of HTML whitespace are collapsed to a single space.
 * - Document type nodes and head/style/title/meta/script elements: dropped.
 * - hr: rendered as a dashed rule followed by a newline.
 * - h1-h6, p, div: a newline is emitted before the content; a trailing
 *   newline is emitted afterwards depending on the tag and on the name
 *   reported by next_child_name().
 * - a: rendered as "text (href)", or just the text when the href is
 *   missing or identical to the text.
 * - Any other element: only the rendered children are returned.
 *
 * Depends on next_child_name(), which is defined outside this block.
 * NOTE(review): it is presumably the lower-cased tag name of the following
 * sibling, or null when there is none -- confirm against its definition.
 *
 * @param DOMNode $node Node to render.
 * @return string Plain-text rendering of the node.
 */
function iterate_over_node($node)
{
    if ($node instanceof DOMText) {
        // Use the explicit HTML whitespace set rather than \s: in byte mode
        // \s is locale-dependent and may match bytes that belong to a
        // multi-byte UTF-8 sequence.
        return preg_replace('/[\t\n\f\r ]+/', " ", $node->wholeText);
    }
    if ($node instanceof DOMDocumentType) {
        // The doctype contributes no text.
        return "";
    }

    $nextName = next_child_name($node);
    $name = strtolower($node->nodeName);

    // Leading whitespace, plus tags that are fully handled up front.
    switch ($name) {
        case "hr":
            return "------\n";
        case "style":
        case "head":
        case "title":
        case "meta":
        case "script":
            // These tags never contribute text.
            return "";
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            // One newline before a heading; the matching trailing newline
            // is appended once the children have been rendered.
            $output = "\n";
            break;
        case "p":
        case "div":
            $output = "\n";
            break;
        default:
            // Unknown tags: emit only their contents.
            $output = "";
            break;
    }

    // Render the children in document order. The guard covers node types
    // for which childNodes is not set.
    if (isset($node->childNodes)) {
        foreach ($node->childNodes as $child) {
            $output .= iterate_over_node($child);
        }
    }

    // Trailing whitespace and link decoration.
    switch ($name) {
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            $output .= "\n";
            break;
        case "p":
        case "br":
            // The following div emits its own leading newline.
            if ($nextName != "div") {
                $output .= "\n";
            }
            break;
        case "div":
            // Add a newline only when something other than a div follows.
            if ($nextName != "div" && $nextName != null) {
                $output .= "\n";
            }
            break;
        case "a":
            $href = $node->getAttribute("href");
            // Append the target unless there is none or the link text is
            // already exactly the target. The comparison is strict so that
            // numeric-looking strings ("1e3" vs "1000") are not conflated.
            if ($href !== null && $href !== "" && $href !== $output) {
                $output = "{$output} ({$href})";
            }
            // A heading directly after the link needs a separating newline.
            switch ($nextName) {
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    $output .= "\n";
                    break;
            }
            break;
        default:
            // No trailing decoration for other tags.
            break;
    }

    return $output;
}
/**
 * Recursively renders a DOM node and its descendants as Markdown-flavoured
 * plain text.
 *
 * - Text nodes: runs of HTML whitespace are collapsed to a single space.
 * - Document type nodes and head/style/title/meta/script elements: dropped.
 * - hr: a dashed rule followed by a newline.
 * - h1-h6, ol, ul, tr, p, div: a leading newline; headings, p, br and div
 *   may also get a trailing newline depending on next_child_name().
 * - td/th: prefixed with a tab; li: rendered as "- item\n".
 * - b/strong, i/em, del, code: wrapped in "**", "_", "~~" and "`".
 * - a: rendered as "[text](href)"; only the text is kept when the href
 *   equals the text (optionally with a mailto:/http://https:// prefix);
 *   only the href is emitted when there is no text and no title; a named
 *   anchor without href is rendered as "[text]".
 * - img: rendered as "![Image - alt](src)" or "![Image](src)".
 *
 * Depends on next_child_name(), which is defined outside this block.
 * NOTE(review): it is presumably the lower-cased tag name of the following
 * sibling, or null when there is none -- confirm against its definition.
 *
 * @param DOMNode $node Node to render.
 * @return string Text rendering of the node.
 */
function iterate_over_node($node)
{
    if ($node instanceof DOMText) {
        // Collapse the HTML whitespace characters (tab, LF, FF, CR, space).
        return preg_replace('/[\t\n\f\r ]+/', " ", $node->wholeText);
    }
    if ($node instanceof DOMDocumentType) {
        // The doctype contributes no text.
        return "";
    }

    $nextName = next_child_name($node);
    $name = strtolower($node->nodeName);

    // Opening marker / leading whitespace, plus tags handled up front.
    switch ($name) {
        case "hr":
            return "------\n";
        case "style":
        case "head":
        case "title":
        case "meta":
        case "script":
            // These tags never contribute text.
            return "";
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "ol":
        case "ul":
            // One newline up front; headings get their trailing newline
            // after the children have been rendered.
            $output = "\n";
            break;
        case "td":
        case "th":
            // Separate table fields with a tab.
            $output = "\t";
            break;
        case "tr":
        case "p":
        case "div":
            $output = "\n";
            break;
        case "li":
            $output = "- ";
            break;
        case "b":
        case "strong":
            $output = "**";
            break;
        case "i":
        case "em":
            $output = "_";
            break;
        case "del":
            $output = "~~";
            break;
        case "code":
            $output = "`";
            break;
        default:
            // Unknown tags: emit only their contents.
            $output = "";
            break;
    }

    // Render the children in document order.
    if (isset($node->childNodes)) {
        foreach ($node->childNodes as $child) {
            $output .= iterate_over_node($child);
        }
    }

    // Closing marker / trailing whitespace.
    switch ($name) {
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            $output .= "\n";
            break;
        case "p":
        case "br":
            // The following div emits its own leading newline.
            if ($nextName != "div") {
                $output .= "\n";
            }
            break;
        case "div":
            // Add a newline only when something other than a div follows.
            if ($nextName != "div" && $nextName != null) {
                $output .= "\n";
            }
            break;
        case "a":
            $href = $node->getAttribute("href");
            $title = $node->getAttribute("title");
            $anchorName = $node->getAttribute("name");
            // Attribute values are tested against the empty string rather
            // than for truthiness so that a value of "0" counts as present.
            $hasHref = ($href !== null && $href !== "");
            $hasTitle = ($title !== null && $title !== "");

            $output = trim($output);

            // Link text that is itself wrapped in [ ] is unwrapped to avoid
            // doubled brackets; in that case the title of the <a>, when
            // present, replaces the text.
            if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
                $output = substr($output, 1, strlen($output) - 2);
                if ($hasTitle) {
                    $output = $title;
                }
            }

            // No link text at all: fall back to the title attribute.
            if ($output === "" && $hasTitle) {
                $output = $title;
            }

            if (!$hasHref) {
                // Not a link; a named anchor is still marked with brackets.
                if ($anchorName !== null && $anchorName !== "") {
                    $output = "[{$output}]";
                }
            } elseif (
                $href !== $output
                && $href !== "mailto:{$output}"
                && $href !== "http://{$output}"
                && $href !== "https://{$output}"
            ) {
                // The text differs from the target, so show both. When the
                // text merely repeats the target, the text is kept as is.
                if ($output !== "") {
                    $output = "[{$output}]({$href})";
                } else {
                    $output = $href;
                }
            }

            // A heading directly after the link needs a separating newline.
            switch ($nextName) {
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    $output .= "\n";
                    break;
            }
            break;
        // mod by r-a-y
        case "img":
            $alt = $node->getAttribute("alt");
            $src = $node->getAttribute("src");
            if ($alt !== null && $alt !== "") {
                $output = "![Image - {$alt}]";
            } else {
                $output = "![Image]";
            }
            $output .= "({$src})";
            break;
        case "li":
            $output .= "\n";
            break;
        case "b":
        case "strong":
            $output .= "**";
            break;
        case "i":
        case "em":
            $output .= "_";
            break;
        case "del":
            $output .= "~~";
            break;
        case "code":
            $output .= "`";
            break;
        default:
            // No closing marker for other tags.
            break;
    }

    return $output;
}