function parse_dom_contents($doc_contents = '', $doc_type = 'html')
{
    $dom_contents = [];
    if ($doc_type == 'html') {
        //-----------------parse html contents---------------
        $dom_contents['html:head'] = '';
        $dom_contents['html:links'] = '';
        $dom_contents['html:scripts'] = '';
        $dom_contents['html:styles'] = '';
        $dom_contents['html:body'] = '';
        //-----------------parse doc_contents---------------
        $dom = new DOMDocument();
        $dom->loadHTML($doc_contents);
        //----------parse head---------
        $head = $dom->getElementsByTagName('head');
        if ($head && 0 < $head->length) {
            $dom_contents['html:head'] = strip_single_tag($dom->savehtml($head->item(0)), 'head');
        }
        //----------parse link---------
        $links = $dom->getElementsByTagName('link');
        if ($links && 0 < $links->length) {
            foreach ($links as $i => $link) {
                if ($link->hasAttributes() && ($url = $link->getAttribute('href'))) {
                    foreach ($link->attributes as $attr) {
                        $name = $attr->nodeName;
                        $value = $attr->nodeValue;
                        $dom_contents['html:links'][$url][$name] = $value;
                    }
                    $dom_contents['html:links'][$url]['string'] = custom_trim($dom->savehtml($links->item($i)));
                }
            }
        }
        //----------parse script---------
        $scripts = $dom->getElementsByTagName('script');
        if ($scripts && 0 < $scripts->length) {
            foreach ($scripts as $i => $script) {
                if ($script->hasAttributes() && ($url = $script->getAttribute('src'))) {
                    foreach ($script->attributes as $attr) {
                        $name = $attr->nodeName;
                        $value = $attr->nodeValue;
                        $dom_contents['html:scripts'][$url][$name] = $value;
                    }
                    $dom_contents['html:scripts'][$url]['string'] = custom_trim($dom->savehtml($scripts->item($i)));
                }
            }
        }
        //----------parse style---------
        $styles = $dom->getElementsByTagName('style');
        if ($styles && 0 < $styles->length) {
            foreach ($styles as $i => $style) {
                if ($style->hasAttributes()) {
                    foreach ($style->attributes as $attr) {
                        $name = $attr->nodeName;
                        $value = $attr->nodeValue;
                        $dom_contents['html:styles'][$i][$name] = $value;
                    }
                }
                $dom_contents['html:styles'][$i]['string'] = strip_single_tag(custom_trim($dom->savehtml($styles->item($i))), 'style');
            }
        }
        //----------parse body---------
        $body = $dom->getElementsByTagName('body');
        if ($body && 0 < $body->length) {
            $dom_contents['html:body'] = strip_single_tag($dom->savehtml($body->item(0)), 'body');
        }
    } elseif ($doc_type == 'css') {
        $dom_contents['css'] = parse_css_selectors($doc_contents);
    }
    return $dom_contents;
}
Example #2
0
/**
 * Limit a given htmltext to a given number of characters without removing html tags
 * @param $htmlText
 * @param int $limit
 */
function limitHTMLText($htmlText, $limit = 350, $removeLinkElements = true)
{
    //First removing videos, it can cause problems.
    $iframesMatched = [];
    $regex = '(<iframe .*\\/iframe>)';
    preg_match($regex, $htmlText, $iframesMatched);
    // We're going to save the first to put it as an article header
    $htmlText = preg_replace($regex, '', $htmlText);
    $htmlText = preg_replace('(videodetector)', '', $htmlText);
    if (count($iframesMatched) > 0) {
        $iframesMatched = $iframesMatched[0];
    } else {
        $iframesMatched = "";
    }
    if ($removeLinkElements) {
        $htmlText = strip_single_tag($htmlText, "a");
    }
    $str = substr($htmlText, 0, $limit);
    //$limit first chars of $htmlText, tags included
    $strWithoutHTMLTags = strip_tags($str);
    //Removing tags, calculating length
    $i = strlen($str);
    while (strlen($strWithoutHTMLTags) < $limit && $i < strlen($htmlText)) {
        //If length not enough and if there is still some text to add : adding chars
        $str .= $htmlText[$i];
        $strWithoutHTMLTags = strip_tags($str);
        //Removing tags for calculating length of the text only
        $i++;
    }
    $str .= "...";
    $str = closetags($str);
    $str = "<div class='videodetector'>" . $iframesMatched . "</div>" . $str;
    return $str;
}