/**
 * Picks the most article-like element of an HTML page and returns it as markup.
 *
 * Every <p> contributes to a numeric score stored in a "readability" attribute
 * on its parent element:
 *   - -50 when the parent's class (checked through classNameMatch()) or id
 *     matches comment|meta|footer|footnote, otherwise +25 when it matches one
 *     of the post/hentry/entry/article patterns (applied once per parent);
 *   - +1 when the paragraph text is longer than 10 bytes;
 *   - +1 for every comma in the paragraph text.
 * The first element in document order holding the highest score wins. The
 * "readability" attributes are not removed by this function, so they can show
 * up in the returned markup.
 *
 * @param string $html       Page markup; it is handled as UTF-8.
 * @param bool   $with_title When true, an <h1> is built from the page <title>
 *                           (or the text 'None title' when there is none).
 *
 * @return array array($markup) on success; array('', $message) when no
 *               paragraph parent could be found (including empty input).
 */
function obgrabArticle($html, $with_title = true)
{
    // Replace all doubled-up <br> tags with paragraph boundaries, and remove <font> tags.
    $html = preg_replace('!<br ?/?>[ \\r\\n\\s]*<br ?/?>!', '</p><p>', $html);
    $html = preg_replace('!</?font[^>]*>!', '', $html);

    $document = new DOMDocument();

    // DOMDocument::loadHTML() throws a ValueError for an empty string on PHP 8,
    // and "@" cannot silence an exception. Skipping the load leaves an empty
    // document, which falls through to the "nothing found" result below.
    if ((string) $html !== '') {
        // Turn every non-ASCII character into a numeric entity so the parser does
        // not have to guess the encoding. This replaces the deprecated
        // mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8') call.
        $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        if ($html !== '') {
            // Real-world markup is rarely valid; parser warnings are deliberately ignored.
            @$document->loadHTML($html);
        }
    }

    $articleContent = $document->createElement('div');

    if ($with_title) {
        // Grab the title from the <title> tag and wrap it in an <h1>.
        // NOTE(review): only the last child of $articleContent is serialized at
        // the end of this function, so this heading never reaches the return
        // value. Confirm whether that is intended before relying on $with_title.
        $titleNodes = $document->getElementsByTagName('title');
        $title = $titleNodes->length > 0 ? $titleNodes->item(0)->nodeValue : 'None title';

        $articleTitle = $document->createElement('h1');
        $articleTitle->appendChild($document->createTextNode($title));
        $articleContent->appendChild($articleTitle);
    }

    // Study all the paragraphs and score the element that contains each of them.
    foreach ($document->getElementsByTagName('p') as $paragraph) {
        $parentNode = $paragraph->parentNode;
        if (!$parentNode instanceof DOMElement) {
            continue;
        }

        if ($parentNode->hasAttribute('readability')) {
            $score = (int) $parentNode->getAttribute('readability');
        } else {
            // First paragraph seen for this parent: create the attribute before
            // the class/id checks, as the original code did.
            $parentNode->setAttribute('readability', '0');
            $score = 0;

            // Look for a special class name.
            if (classNameMatch($parentNode, '/(comment|meta|footer|footnote)/')) {
                $score -= 50;
            } elseif (classNameMatch($parentNode, '/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/')) {
                $score += 25;
            }

            // Look for a special ID.
            $id = $parentNode->getAttribute('id');
            if (preg_match('/(comment|meta|footer|footnote)/', $id)) {
                $score -= 50;
            } elseif (preg_match('/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/', $id)) {
                $score += 25;
            }
        }

        $text = $paragraph->textContent;

        // Add a point for the paragraph itself (byte length, as before).
        if (strlen($text) > 10) {
            $score++;
        }

        // Add points for any commas within this paragraph.
        $score += substr_count($text, ',');

        $parentNode->setAttribute('readability', (string) $score);
    }

    // Pick the first element (document order) that carries the highest score.
    $topDiv = null;
    $topScore = 0;
    foreach ($document->getElementsByTagName('*') as $node) {
        if (!$node->hasAttribute('readability')) {
            continue;
        }

        $nodeScore = (int) $node->getAttribute('readability');
        if ($topDiv === null || $nodeScore > $topScore) {
            $topDiv = $node;
            $topScore = $nodeScore;
        }
    }

    if ($topDiv === null) {
        // The message literal keeps the line break that is embedded in the original string.
        return array('', "Can't auto get fulltext. 
please use Parser Code!");
    }

    $topDiv->removeAttribute("style");
    cleanStyles($topDiv);            // Removes all style attributes
    $topDiv = killBreaks($topDiv);   // Collapses consecutive <br />'s into a single <br />

    // Clean out junk from the winning element just in case. Tables and <h2>
    // elements are not cleaned by this function.
    $topDiv = clean($topDiv, 'form');
    $topDiv = clean($topDiv, 'object');
    $topDiv = clean($topDiv, 'h1');
    $topDiv = clean($topDiv, 'iframe');
    $topDiv = clean($topDiv, 'script');

    $articleContent->appendChild($topDiv);

    // Serialize only the element that was just appended.
    $html = $document->saveXML($articleContent->lastChild);

    return array($html);
}
/**
 * Picks the most article-like element of an HTML page and returns it wrapped in a <div>.
 *
 * Every <p> contributes to a numeric score stored in a "readability" attribute
 * on its parent element:
 *   - for each of the parent's non-empty class and id attributes: -50 when the
 *     value matches the "unlikely" word list, otherwise +25 when it matches the
 *     "likely" word list (applied once per parent);
 *   - +1 when the paragraph text is longer than 10 bytes;
 *   - +1 for every comma in the paragraph text.
 * The first element in document order holding the highest score wins and is
 * passed through cleanStyles(), killDivs(), killBreaks() and clean().
 *
 * @param string $html Page markup; it is handled as UTF-8.
 *
 * @return DOMElement A detached <div> whose only child is the winning element,
 *                    or a <div> carrying an apology message when no paragraph
 *                    parent could be found (including empty input).
 */
function grabArticle($html)
{
    // Replace all doubled-up <br> tags with paragraph boundaries, and remove <font> tags.
    $html = preg_replace('!<br ?/?>[ \\r\\n\\s]*<br ?/?>!', '</p><p>', $html);
    $html = preg_replace('!</?font[^>]*>!', '', $html);

    $document = new DOMDocument();

    // DOMDocument::loadHTML() throws a ValueError for an empty string on PHP 8,
    // and "@" cannot silence an exception. Skipping the load leaves an empty
    // document, which falls through to the apology message below.
    if ((string) $html !== '') {
        // Turn every non-ASCII character into a numeric entity so the parser does
        // not have to guess the encoding. This replaces the deprecated
        // mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8') call.
        $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        if ($html !== '') {
            // Real-world markup is rarely valid; parser warnings are deliberately ignored.
            @$document->loadHTML($html);
        }
    }

    $articleContent = $document->createElement('div');

    // The same two word lists are applied to the class and to the id attribute.
    // "pagination" appears in both lists; the negative list is tested first.
    $unlikelyPattern = '/combx|comment|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/';
    $likelyPattern = '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/';

    // Study all the paragraphs and score the element that contains each of them.
    foreach ($document->getElementsByTagName('p') as $paragraph) {
        $parentNode = $paragraph->parentNode;
        if (!$parentNode instanceof DOMElement) {
            continue;
        }

        if ($parentNode->hasAttribute('readability')) {
            $score = (int) $parentNode->getAttribute('readability');
        } else {
            $score = 0;

            // Look for a special class name, then for a special ID.
            foreach (array('class', 'id') as $attributeName) {
                // getAttribute() yields '' for a missing attribute, so this one
                // test covers both "absent" and "present but empty".
                $attributeValue = $parentNode->getAttribute($attributeName);
                if ($attributeValue === '') {
                    continue;
                }

                if (preg_match($unlikelyPattern, $attributeValue)) {
                    $score -= 50;
                } elseif (preg_match($likelyPattern, $attributeValue)) {
                    $score += 25;
                }
            }
        }

        $text = $paragraph->textContent;

        // Add a point for the paragraph itself (byte length, as before).
        if (strlen($text) > 10) {
            $score++;
        }

        // Add points for any commas within this paragraph.
        $score += substr_count($text, ',');

        $parentNode->setAttribute('readability', (string) $score);
    }

    // Pick the first element (document order) that carries the highest score.
    $topDiv = null;
    $topScore = 0;
    foreach ($document->getElementsByTagName('*') as $node) {
        if (!$node->hasAttribute('readability')) {
            continue;
        }

        $nodeScore = (int) $node->getAttribute('readability');
        if ($topDiv === null || $nodeScore > $topScore) {
            $topDiv = $node;
            $topScore = $nodeScore;
        }
    }

    if ($topDiv === null) {
        $topDiv = $document->createElement('div', 'Sorry, readability was unable to parse this page for content.');
    } else {
        cleanStyles($topDiv);            // Removes all style attributes
        $topDiv = killDivs($topDiv);     // Removes DIVs that have more non-<p> stuff than <p> stuff
        $topDiv = killBreaks($topDiv);   // Collapses consecutive <br />'s into a single <br />

        // Clean out junk from the winning element just in case. <h2> elements
        // are not cleaned by this function.
        $topDiv = clean($topDiv, 'form');
        $topDiv = clean($topDiv, 'object');
        $topDiv = clean($topDiv, 'table', 250);
        $topDiv = clean($topDiv, 'h1');
        $topDiv = clean($topDiv, 'iframe');
        $topDiv = clean($topDiv, 'script');
    }

    $articleContent->appendChild($topDiv);

    return $articleContent;
}