Exemplo n.º 1
0
/**
 * Extracts the most article-like element of an HTML page and returns it as markup.
 *
 * Each <p> adds to a score stored in a "readability" attribute on its parent
 * element: the parent's class and id seed the score (-50 for comment/meta/footer
 * style names, +25 for post/entry/article style names), then every paragraph
 * longer than 10 bytes adds one point and every comma adds one more. The first
 * element with the highest score is cleaned up and serialized.
 *
 * @param string $html       Raw page markup, treated as UTF-8.
 * @param bool   $with_title When true, an <h1> holding the page <title> (or
 *                           "None title") is added to the internal wrapper.
 *                           NOTE(review): only the winning element is serialized
 *                           below, so this heading never shows up in the returned
 *                           markup - confirm whether that is intended.
 *
 * @return array array($markup) on success, or array('', $message) when no
 *               candidate element could be found.
 */
function obgrabArticle($html, $with_title = true)
{
    $failure = array('', "Can't auto get fulltext. please use Parser Code!");

    // Replace all doubled-up <br> tags with paragraph breaks and strip <font>
    // tags. Case-insensitive so that <BR> / <FONT> markup is handled as well.
    $html = preg_replace(
        array('!<br ?/?>\s*<br ?/?>!i', '!</?font[^>]*>!i'),
        array('</p><p>', ''),
        (string) $html
    );

    // preg_replace() yields null on a PCRE failure, and DOMDocument::loadHTML()
    // throws on an empty string in PHP 8, so bail out before parsing.
    if ($html === null || trim($html) === '') {
        return $failure;
    }

    // libxml assumes ISO-8859-1 when the page declares no charset; turning every
    // non-ASCII character into a numeric entity keeps UTF-8 text intact.
    $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    $document = new DOMDocument();
    // Real-world HTML is rarely valid; collect parser warnings instead of emitting them.
    $previousErrorMode = libxml_use_internal_errors(true);
    $document->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $articleContent = $document->createElement('div');
    if ($with_title) {
        // Grab the title from the <title> tag and inject it as the heading.
        $titleNodes = $document->getElementsByTagName('title');
        $title = $titleNodes->length > 0 ? $titleNodes->item(0)->nodeValue : 'None title';
        $articleTitle = $document->createElement('h1');
        $articleTitle->appendChild($document->createTextNode($title));
        $articleContent->appendChild($articleTitle);
    }

    // Study all the paragraphs and score the element that contains each of them.
    foreach ($document->getElementsByTagName('p') as $paragraph) {
        $parentNode = $paragraph->parentNode;
        if (!$parentNode instanceof DOMElement) {
            continue;
        }

        if ($parentNode->hasAttribute('readability')) {
            $score = (int) $parentNode->getAttribute('readability');
        } else {
            // First paragraph seen for this parent: seed the score from class and id.
            $score = 0;

            if (classNameMatch($parentNode, '/(comment|meta|footer|footnote)/')) {
                $score -= 50;
            } elseif (classNameMatch($parentNode, '/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/')) {
                $score += 25;
            }

            $id = $parentNode->getAttribute('id');
            if (preg_match('/(comment|meta|footer|footnote)/', $id)) {
                $score -= 50;
            } elseif (preg_match('/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/', $id)) {
                $score += 25;
            }
        }

        $text = $paragraph->textContent;
        // One point for a paragraph with some substance, plus one per comma.
        if (strlen($text) > 10) {
            $score++;
        }
        $score += substr_count($text, ',');

        $parentNode->setAttribute('readability', (string) $score);
    }

    // Pick the first element carrying the highest score.
    $topDiv = null;
    $topScore = 0;
    foreach ($document->getElementsByTagName('*') as $node) {
        if (!$node->hasAttribute('readability')) {
            continue;
        }
        $score = (int) $node->getAttribute('readability');
        if ($topDiv === null || $score > $topScore) {
            $topDiv = $node;
            $topScore = $score;
        }
    }

    if ($topDiv === null) {
        return $failure;
    }

    $topDiv->removeAttribute('style');
    // Removes all style attributes.
    cleanStyles($topDiv);
    // Collapses any consecutive <br />'s into just one <br />.
    $topDiv = killBreaks($topDiv);
    // Cleans out junk from the winning element; <table> and <h2> elements are left in place.
    foreach (array('form', 'object', 'h1', 'iframe', 'script') as $tagName) {
        $topDiv = clean($topDiv, $tagName);
    }

    $articleContent->appendChild($topDiv);

    return array($document->saveXML($articleContent->lastChild));
}
/**
 * Extracts the most article-like element of an HTML page.
 *
 * Each <p> adds to a score stored in a "readability" attribute on its parent
 * element: the parent's class and id seed the score (-50 when they look like
 * page furniture such as comments, sidebars or footers, +25 when they look like
 * article containers), then every paragraph longer than 10 bytes adds one point
 * and every comma adds one more. The first element with the highest score is
 * cleaned up and wrapped in a new <div>.
 *
 * @param string $html Raw page markup, treated as UTF-8.
 *
 * @return DOMElement A <div> whose only child is the cleaned winning element,
 *                    or a <div> holding an apology message when no candidate
 *                    could be found (including empty input).
 */
function grabArticle($html)
{
    // Replace all doubled-up <br> tags with paragraph breaks and strip <font>
    // tags. Case-insensitive so that <BR> / <FONT> markup is handled as well.
    $html = preg_replace(
        array('!<br ?/?>\s*<br ?/?>!i', '!</?font[^>]*>!i'),
        array('</p><p>', ''),
        (string) $html
    );

    $document = new DOMDocument();

    // preg_replace() yields null on a PCRE failure, and DOMDocument::loadHTML()
    // throws on an empty string in PHP 8; in both cases keep the document empty
    // so the "unable to parse" result below is returned.
    if ($html !== null && trim($html) !== '') {
        // libxml assumes ISO-8859-1 when the page declares no charset; turning
        // every non-ASCII character into a numeric entity keeps UTF-8 text intact.
        $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        // Real-world HTML is rarely valid; collect parser warnings instead of emitting them.
        $previousErrorMode = libxml_use_internal_errors(true);
        $document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $unlikelyPattern = '/combx|comment|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/';
    $likelyPattern = '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/';

    // Study all the paragraphs and score the element that contains each of them.
    foreach ($document->getElementsByTagName('p') as $paragraph) {
        $parentNode = $paragraph->parentNode;
        if (!$parentNode instanceof DOMElement) {
            continue;
        }

        if ($parentNode->hasAttribute('readability')) {
            $score = (int) $parentNode->getAttribute('readability');
        } else {
            // First paragraph seen for this parent: seed the score from its
            // class and id. The penalty takes precedence over the bonus.
            $score = 0;
            foreach (array('class', 'id') as $attributeName) {
                $attributeValue = $parentNode->getAttribute($attributeName);
                if ($attributeValue === '') {
                    continue;
                }
                if (preg_match($unlikelyPattern, $attributeValue)) {
                    $score -= 50;
                } elseif (preg_match($likelyPattern, $attributeValue)) {
                    $score += 25;
                }
            }
        }

        $text = $paragraph->textContent;
        // One point for a paragraph with some substance, plus one per comma.
        if (strlen($text) > 10) {
            $score++;
        }
        $score += substr_count($text, ',');

        $parentNode->setAttribute('readability', (string) $score);
    }

    // Pick the first element carrying the highest score.
    $topDiv = null;
    $topScore = 0;
    foreach ($document->getElementsByTagName('*') as $node) {
        if (!$node->hasAttribute('readability')) {
            continue;
        }
        $score = (int) $node->getAttribute('readability');
        if ($topDiv === null || $score > $topScore) {
            $topDiv = $node;
            $topScore = $score;
        }
    }

    if ($topDiv === null) {
        $topDiv = $document->createElement('div', 'Sorry, readability was unable to parse this page for content.');
    } else {
        // Removes all style attributes.
        cleanStyles($topDiv);
        // Goes in and removes DIV's that have more non <p> stuff than <p> stuff.
        $topDiv = killDivs($topDiv);
        // Collapses any consecutive <br />'s into just one <br />.
        $topDiv = killBreaks($topDiv);
        // Cleans out junk from the winning element just in case.
        $topDiv = clean($topDiv, 'form');
        $topDiv = clean($topDiv, 'object');
        $topDiv = clean($topDiv, 'table', 250);
        $topDiv = clean($topDiv, 'h1');
        $topDiv = clean($topDiv, 'iframe');
        $topDiv = clean($topDiv, 'script');
    }

    $articleContent = $document->createElement('div');
    $articleContent->appendChild($topDiv);

    return $articleContent;
}