/**
 * Find the element most likely to hold the article text and return it wrapped in a
 * div with id "readability-content".
 *
 * Steps:
 *  1. Queue paragraph-like elements (p, td, section) and turn div/article/section elements
 *     that contain no block-level markup into p elements.
 *  2. Score every queued node and credit the score to its parent (full) and grandparent
 *     (divided by GRANDPARENT_SCORE_DIVISOR).
 *  3. When FLAG_STRIP_UNLIKELYS is active, remove low-scoring "unlikely" elements.
 *  4. Scale candidate scores by link density and pick the highest one.
 *  5. Collect the top candidate and related siblings into the result wrapper.
 *  6. Run prepArticle(); if the text is shorter than MIN_ARTICLE_LENGTH, restore the cached
 *     body and retry with one flag fewer.
 *
 * Steps 3 and 4 rely on XPath and therefore only run when $page is a DOMDocument that has
 * a documentElement; otherwise the whole page content is used as the top candidate.
 *
 * @param DOMDocument|DOMElement|null $page Tree to analyse; $this->dom is used when empty.
 *
 * @return DOMElement|false The content wrapper, or false when the result is still too short
 *                          after every fallback flag has been removed.
 */
protected function grabArticle($page = null)
{
    if (!$page) {
        $page = $this->dom;
    }

    $xpath = null;
    $nodesToScore = array();

    if ($page instanceof DOMDocument && isset($page->documentElement)) {
        $xpath = new DOMXPath($page);
    }

    $allElements = $page->getElementsByTagName('*');

    // $allElements is a live list: replacing a node changes what item($nodeIndex) returns.
    for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); $nodeIndex++) {
        $tagName = $node->tagName;

        // Some well known site uses sections as paragraphs.
        if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'section') === 0) {
            $nodesToScore[] = $node;
        }

        // Turn divs into P tags where they have been used inappropriately
        // (as in, where they contain no other block level elements).
        if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');

                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Step back so the replacement (now at this index) is visited next.
                    $nodeIndex--;
                    // NOTE(review): the re-visit above appears to queue $newNode a second time
                    // through the 'p' tag check, doubling its weight; confirm this is intended
                    // before changing it, since it affects candidate ranking.
                    $nodesToScore[] = $newNode;
                } catch (Exception $e) {
                    $this->dbg('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
                }
            } else {
                // Wrap loose text in P elements so it can be scored; these P elements are
                // marked with data-readability-styled so they can be recognised later.
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);

                    if ($childNode === null) {
                        break;
                    }

                    if ($childNode instanceof DOMProcessingInstruction) {
                        // Processing instructions (executable php/xml style tags) are dropped.
                        $childNode->parentNode->removeChild($childNode);
                        // childNodes is live: the next sibling slid into slot $i and the list
                        // is one shorter, so re-check this index and shrink the bound.
                        $i--;
                        $il--;
                        continue;
                    }

                    if ($childNode->nodeType === XML_TEXT_NODE) {
                        $p = $this->dom->createElement('p');
                        $p->innerHTML = $childNode->nodeValue;
                        $p->setAttribute('data-readability-styled', 'true');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc.
     * Maybe eventually link density.
     **/
    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;

        // Detached nodes have no parent, and a DOMDocument parent cannot carry attributes.
        if (!($parentNode instanceof DOMElement)) {
            continue;
        }

        $grandParentNode = $parentNode->parentNode instanceof DOMElement ? $parentNode->parentNode : null;
        $innerText = $this->getInnerText($nodeToScore);
        $textLength = mb_strlen($innerText);

        // If this paragraph is less than MIN_PARAGRAPH_LENGTH characters, don't even count it.
        if ($textLength < self::MIN_PARAGRAPH_LENGTH) {
            continue;
        }

        // Initialize readability data for the parent.
        if (!$parentNode->hasAttribute('readability')) {
            $this->initializeNode($parentNode);
            $parentNode->setAttribute('data-candidate', 'true');
        }

        // Initialize readability data for the grandparent.
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability')) {
            $this->initializeNode($grandParentNode);
            $grandParentNode->setAttribute('data-candidate', 'true');
        }

        // Add a point for the paragraph itself as a base.
        $contentScore = 1;

        // Add points for any commas within this paragraph.
        $contentScore += $this->getCommaCount($innerText);

        // For every SCORE_CHARS_IN_PARAGRAPH characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor($textLength / self::SCORE_CHARS_IN_PARAGRAPH), 3);

        // For every SCORE_WORDS_IN_PARAGRAPH words in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);

        // An experimental bonus/penalty of half a point per ancestor matching the
        // 'positive'/'negative' patterns (capped at 3 either way) was disabled here.

        // Add the score to the parent. The grandparent gets a reduced share.
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
        }
    }

    /**
     * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
     * This is faster to do before scoring but safer after.
     */
    if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
        // A lone footer or aside that scored below 40 and does not sit directly inside an article.
        $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);

        // The query result is walked backwards, so descendants are handled before ancestors.
        for ($c = $candidates->length - 1; $c >= 0; $c--) {
            $node = $candidates->item($c);

            // node should be readable but not inside of an article otherwise it's probably non-readable block
            if ($node->hasAttribute('readability')
                && (int) $node->getAttributeNode('readability')->value < 40
                && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)
            ) {
                $this->dbg('Removing unlikely candidate ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . (int) $node->getAttributeNode('readability')->value);
                $node->parentNode->removeChild($node);
            }
        }

        $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);

        for ($c = $candidates->length - 1; $c >= 0; $c--) {
            $node = $candidates->item($c);

            /* Remove unlikely candidates */
            $unlikelyMatchString = $node->getAttribute('class') . " " . $node->getAttribute('id') . " " . $node->getAttribute('style');

            if (mb_strlen($unlikelyMatchString) > 3
                && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString)
                && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
            ) {
                $this->dbg('Removing unlikely candidate ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                $node->parentNode->removeChild($node);
            }
        }

        unset($candidates);
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;

    if ($xpath) {
        // Candidates are re-queried from the DOM because nodes may have been deleted above;
        // keeping an array of DOMElements across deletions would be unsafe.
        $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);

        for ($c = $candidates->length - 1; $c >= 0; $c--) {
            $candidate = $candidates->item($c);

            // Scale the final candidates score based on link density. Good content should have a
            // relatively small link density (5% or less) and be mostly unaffected by this operation.
            // If not for this we would have used XPath to find maximum @readability.
            $readability = $candidate->getAttributeNode('readability');
            $readability->value = round($readability->value * (1 - $this->getLinkDensity($candidate)), 0, PHP_ROUND_HALF_UP);

            if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
                $this->dbg('Candidate: ' . $candidate->getNodePath() . ' (' . $candidate->getAttribute('class') . ':' . $candidate->getAttribute('id') . ') with score ' . $readability->value);
                $topCandidate = $candidate;
            }
        }

        unset($candidates);
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
        $topCandidate = $this->dom->createElement('div');

        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // we don't have a body either? what a mess! :)
                $this->dbg('The page has no body!');
            } else {
                $this->dbg('Setting body to a raw HTML of original page!');
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }

        $this->initializeNode($topCandidate);
    }

    // Set table as the main node if resulted data is table element.
    $tagName = $topCandidate->tagName;

    if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
        $up = $topCandidate;

        while ($up->parentNode instanceof DOMElement) {
            $up = $up->parentNode;

            if (strcasecmp($up->tagName, 'table') === 0) {
                $topCandidate = $up;
                break;
            }
        }
    }

    $this->dbg('Top candidate: ' . $topCandidate->getNodePath());

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');

    $topScore = (int) $topCandidate->getAttribute('readability');
    $topClass = $topCandidate->getAttribute('class');
    $siblingScoreThreshold = max(10, $topScore * 0.2);

    // The fallback div created for a page without a documentElement is never attached,
    // so it has no parent and therefore no siblings to walk.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $sl = $siblingNodes ? $siblingNodes->length : 0;

    for ($s = 0; $s < $sl; $s++) {
        $siblingNode = $siblingNodes->item($s);
        $siblingNodeName = $siblingNode->nodeName;
        $isElement = $siblingNode->nodeType === XML_ELEMENT_NODE;
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->getNodePath() . ($isElement && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));

        if ($siblingNode->isSameNode($topCandidate)) {
            $append = true;
        }

        $contentBonus = 0;

        // Give a bonus if sibling nodes and top candidates have the same classname.
        if ($isElement && $topClass !== '' && $siblingNode->getAttribute('class') === $topClass) {
            $contentBonus += $topScore * 0.2;
        }

        if ($isElement
            && $siblingNode->hasAttribute('readability')
            && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold
        ) {
            $append = true;
        }

        if (strcasecmp($siblingNodeName, 'p') === 0) {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode, true, true);
            $nodeLength = mb_strlen($nodeContent);

            if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) {
                $append = true;
            } elseif ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\\.( |$)/', $nodeContent)) {
                // Short, link-free paragraph containing a sentence end.
                // NOTE(review): the strict comparison only matches integer 0; confirm that
                // getLinkDensity() does not return float 0.0 for link-free text.
                $append = true;
            }
        }

        if (!$append) {
            continue;
        }

        $this->dbg('Appending node: ' . $siblingNode->getNodePath());

        // Moving the sibling itself removes it from the live list, so index and bound
        // must both step back; copying its markup into a new div leaves the list intact.
        $moveOriginal = true;
        $nodeToAppend = $siblingNode;

        if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
            /* We have a node that isn't a common block level element, like a form or td tag.
               Turn it into a div so it doesn't get filtered out later by accident. */
            $this->dbg('Altering siblingNode ' . $siblingNodeName . ' to div.');
            $nodeToAppend = $this->dom->createElement('div');

            try {
                $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                $nodeToAppend->setAttribute('alt', $siblingNodeName);
                $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                $moveOriginal = false;
            } catch (Exception $e) {
                $this->dbg('Could not alter siblingNode ' . $siblingNodeName . ' to div, reverting to original.');
                $nodeToAppend = $siblingNode;
            }
        }

        if ($moveOriginal) {
            $s--;
            $sl--;
        }

        // Class names and ids are left in place here; the original comment says they are
        // now removed via a RegExp post_filter.
        $articleContent->appendChild($nodeToAppend);
    }

    unset($xpath);

    // So we have all of the content that we need. Now we clean it up for presentation.
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (mb_strlen($this->getInnerText($articleContent, false)) < self::MIN_ARTICLE_LENGTH) {
        // Restore the untouched body before any retry (this also happens when giving up).
        if (!$this->body->hasChildNodes()) {
            $this->body = $this->dom->createElement('body');
        }

        $this->body->innerHTML = $this->bodyCache;

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");

            return $this->grabArticle($this->body);
        }

        if ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
            $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
            $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");

            return $this->grabArticle($this->body);
        }

        if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");

            return $this->grabArticle($this->body);
        }

        return false;
    }

    return $articleContent;
}
/**
 * Find the element most likely to hold the article text and return it wrapped in a
 * div with id "readability-content".
 *
 * Steps:
 *  1. Node prepping: optionally drop "unlikely" elements, queue P/TD/PRE elements for
 *     scoring, and turn DIVs without block-level markup into P elements.
 *  2. Score every queued node and credit the score to its parent (full) and grandparent (half).
 *  3. Scale candidate scores by link density and pick the highest one, falling back to a
 *     copy of the whole page when nothing (or only the body) qualifies.
 *  4. Collect the top candidate and related siblings into the result wrapper.
 *  5. Run prepArticle(); if the text is shorter than 250 characters, retry against
 *     $this->body with one flag fewer.
 *
 * @param DOMDocument|DOMElement|null $page Tree to analyse; $this->dom is used when empty.
 *
 * @return DOMElement|false The content wrapper, or false when the result is still too short
 *                          after every fallback flag has been removed.
 */
protected function grabArticle($page = null)
{
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);

    if (!$page) {
        $page = $this->dom;
    }

    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
     * TODO: Shouldn't this be a reverse traversal?
     **/
    $nodesToScore = array();

    // $allElements is a live list: removing or replacing a node changes what item($nodeIndex) returns.
    for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');

            if (preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString)
                && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)
                && $tagName !== 'BODY'
            ) {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                // The next element slid into this index; stay on it.
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName === 'P' || $tagName === 'TD' || $tagName === 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName === 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');

                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Step back so the replacement P (now at this index) is visited next and
                    // queued by the tag check above. The old div is detached at this point and
                    // would only be skipped by the scoring loop, so it is not queued.
                    $nodeIndex--;
                } catch (Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            } else {
                /* EXPERIMENTAL */
                // TODO: change these p elements back to text nodes after processing
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);

                    if ($childNode->nodeType === XML_TEXT_NODE) {
                        // Wrap loose text in an inline-styled P so it can be scored.
                        $p = $this->dom->createElement('p');
                        $p->innerHTML = $childNode->nodeValue;
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();

    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;

        // Skip detached nodes and nodes whose parent is not an element (no tagName).
        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        $grandParentNode = $parentNode->parentNode instanceof DOMElement ? $parentNode->parentNode : null;
        $innerText = $this->getInnerText($nodeToScore);
        // Lengths are measured in characters, not bytes, so multi-byte text is not over-counted.
        $textLength = mb_strlen($innerText);

        /* If this paragraph is less than 25 characters, don't even count it. */
        if ($textLength < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability')) {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        /* Add a point for the paragraph itself as a base. */
        $contentScore = 1;

        /* Add one point per comma-separated segment, i.e. the comma count plus one. */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor($textLength / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore / 2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;

    foreach ($candidates as $candidate) {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidate->getAttributeNode('readability');
        $readability->value = $readability->value * (1 - $this->getLinkDensity($candidate));

        $this->dbg('Candidate: ' . $candidate->tagName . ' (' . $candidate->getAttribute('class') . ':' . $candidate->getAttribute('id') . ') with score ' . $readability->value);

        if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
            $topCandidate = $candidate;
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) === 'BODY') {
        $topCandidate = $this->dom->createElement('div');

        if ($page instanceof DOMDocument) {
            // When there is no documentElement there is no body either; the empty div
            // stays detached and is only initialised below.
            if (isset($page->documentElement)) {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $this->reinitBody();
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }

        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');

    $topScore = (int) $topCandidate->getAttribute('readability');
    $topClass = $topCandidate->getAttribute('class');
    $siblingScoreThreshold = max(10, $topScore * 0.2);

    // A detached top candidate has no parent; check explicitly instead of silencing the
    // resulting warning with the @ operator.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $sl = $siblingNodes ? $siblingNodes->length : 0;

    for ($s = 0; $s < $sl; $s++) {
        $siblingNode = $siblingNodes->item($s);
        $isElement = $siblingNode->nodeType === XML_ELEMENT_NODE;
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . ($isElement && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));

        if ($siblingNode->isSameNode($topCandidate)) {
            $append = true;
        }

        $contentBonus = 0;

        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($isElement && $topClass !== '' && $siblingNode->getAttribute('class') === $topClass) {
            $contentBonus += $topScore * 0.2;
        }

        if ($isElement
            && $siblingNode->hasAttribute('readability')
            && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold
        ) {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) === 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = mb_strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25) {
                $append = true;
            } elseif ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\\.( |$)/', $nodeContent)) {
                // Short, link-free paragraph containing a sentence end.
                // NOTE(review): the strict comparison only matches integer 0; confirm that
                // getLinkDensity() does not return float 0.0 for link-free text.
                $append = true;
            }
        }

        if (!$append) {
            continue;
        }

        $this->dbg('Appending node: ' . $siblingNode->nodeName);

        // Moving the sibling itself removes it from the live list, so index and bound
        // must both step back; copying its markup into a new div leaves the list intact.
        $moveOriginal = true;
        $nodeToAppend = $siblingNode;
        $sibNodeName = strtoupper($siblingNode->nodeName);

        if ($sibNodeName !== 'DIV' && $sibNodeName !== 'P') {
            /* We have a node that isn't a common block level element, like a form or td tag.
               Turn it into a div so it doesn't get filtered out later by accident. */
            $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
            $nodeToAppend = $this->dom->createElement('div');

            try {
                $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                $moveOriginal = false;
            } catch (Exception $e) {
                $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                $nodeToAppend = $siblingNode;
            }
        }

        if ($moveOriginal) {
            $s--;
            $sl--;
        }

        /* To ensure a node does not interfere with readability styles, remove its classnames */
        $nodeToAppend->removeAttribute('class');

        $articleContent->appendChild($nodeToAppend);
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (mb_strlen($this->getInnerText($articleContent, false)) < 250) {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        $this->reinitBody();

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);

            return $this->grabArticle($this->body);
        }

        if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);

            return $this->grabArticle($this->body);
        }

        if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);

            return $this->grabArticle($this->body);
        }

        return false;
    }

    return $articleContent;
}