/**
 * Find the element that most likely holds the article and return it wrapped in a new container.
 *
 * The method works in passes over $page:
 *  1. queue <p>/<td>/<section> elements for scoring and turn <div>/<article>/<section>
 *     elements without block-level content into <p> elements;
 *  2. score every queued node and credit the score to its parent (full) and grandparent (part);
 *  3. when FLAG_STRIP_UNLIKELYS is active, remove low-scoring nodes whose class/id/style
 *     match the "unlikely candidate" patterns;
 *  4. pick the candidate with the best link-density-adjusted score;
 *  5. move the candidate and its related siblings into a new div#readability-content
 *     and pass that to prepArticle().
 *
 * Passes 3 and 4 rely on XPath and therefore only run when $page is a DOMDocument with
 * a documentElement; otherwise the whole page content becomes the candidate.
 *
 * When the extracted text is shorter than MIN_ARTICLE_LENGTH, the body is restored from
 * $this->bodyCache and the method calls itself again with one more flag switched off
 * (FLAG_STRIP_UNLIKELYS, then FLAG_WEIGHT_ATTRIBUTES, then FLAG_CLEAN_CONDITIONALLY).
 *
 * @param DOMDocument|DOMElement|null $page Node to analyse; defaults to $this->dom.
 *
 * @return DOMElement|false The div#readability-content element, or false when no flag
 *                          is left to drop and the content is still too short.
 */
protected function grabArticle($page = null)
 {
     if (!$page) {
         $page = $this->dom;
     }
     $xpath = null;
     $nodesToScore = array();
     if ($page instanceof DOMDocument && isset($page->documentElement)) {
         $xpath = new DOMXPath($page);
     }
     $allElements = $page->getElementsByTagName('*');
     // item() is re-read on every iteration, so DOM changes made inside the loop are taken into account.
     for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); $nodeIndex++) {
         $tagName = $node->tagName;
         // Some well known site uses sections as paragraphs.
         if (strcasecmp($tagName, 'p') === 0 || strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'section') === 0) {
             $nodesToScore[] = $node;
         }
         // Turn divs into P tags where they have been used inappropriately
         //  (as in, where they contain no other block level elements).
         if (strcasecmp($tagName, 'div') === 0 || strcasecmp($tagName, 'article') === 0 || strcasecmp($tagName, 'section') === 0) {
             if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                 //$this->dbg('Altering '.$node->getNodePath().' to p');
                 $newNode = $this->dom->createElement('p');
                 try {
                     $newNode->innerHTML = $node->innerHTML;
                     // It's easier to debug using original attributes.
                     //$newNode->setAttribute('class', $node->getAttribute('class'));
                     //$newNode->setAttribute('id', $node->getAttribute('id'));
                     $node = $node->parentNode->replaceChild($newNode, $node);
                     // Rewind so the position of the replaced element is looked at again.
                     // NOTE(review): if the element list is live, the new <p> is then matched by the
                     // 'p' test above as well and ends up queued twice — confirm this weight is intended.
                     $nodeIndex--;
                     $nodesToScore[] = $newNode;
                 } catch (Exception $e) {
                     $this->dbg('Could not alter div/article to p, reverting back to div: ' . $e->getMessage());
                 }
             } else {
                 // Will change these P elements back to text nodes after processing.
                 for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                     $childNode = $node->childNodes->item($i);
                     if (is_object($childNode) && get_class($childNode) === 'DOMProcessingInstruction') {
                         //executable tags (<?php or <?xml) warning
                         $childNode->parentNode->removeChild($childNode);
                         // The removal shifts the remaining children one slot down: step back so the
                         // next sibling is not skipped and the loop never reads past the end of the list.
                         $i--;
                         $il--;
                         continue;
                     }
                     if ($childNode->nodeType == 3) {
                         // XML_TEXT_NODE
                         //$this->dbg('replacing text node with a P tag with the same content.');
                         $p = $this->dom->createElement('p');
                         $p->innerHTML = $childNode->nodeValue;
                         // Marker attribute for the paragraphs created here.
                         $p->setAttribute('data-readability-styled', 'true');
                         $childNode->parentNode->replaceChild($p, $childNode);
                     }
                 }
             }
         }
     }
     /**
      * Loop through all paragraphs, and assign a score to them based on how content-y they look.
      * Then add their score to their parent node.
      *
      * A score is determined by things like number of commas, class names, etc.
      * Maybe eventually link density.
      **/
     for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; $pt++) {
         $parentNode = $nodesToScore[$pt]->parentNode;
         // No parent node? Move on... (this is the case for nodes replaced in the first pass).
         if (!$parentNode) {
             continue;
         }
         $grandParentNode = $parentNode->parentNode instanceof DOMElement ? $parentNode->parentNode : null;
         $innerText = $this->getInnerText($nodesToScore[$pt]);
         // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
         if (mb_strlen($innerText) < self::MIN_PARAGRAPH_LENGTH) {
             continue;
         }
         // Initialize readability data for the parent and flag it for the candidate search below.
         if (!$parentNode->hasAttribute('readability')) {
             $this->initializeNode($parentNode);
             $parentNode->setAttribute('data-candidate', 'true');
         }
         // Initialize readability data for the grandparent.
         if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
             $this->initializeNode($grandParentNode);
             $grandParentNode->setAttribute('data-candidate', 'true');
         }
         // Add a point for the paragraph itself as a base.
         $contentScore = 1;
         // Add points for any commas within this paragraph.
         $contentScore += $this->getCommaCount($innerText);
         // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
         $contentScore += min(floor(mb_strlen($innerText) / self::SCORE_CHARS_IN_PARAGRAPH), 3);
         // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
         $contentScore += min(floor($this->getWordCount($innerText) / self::SCORE_WORDS_IN_PARAGRAPH), 3);
         // Disabled experiment: the escaped terminator on the next line keeps the comment open up to
         // the closing marker after the last statement, so none of the code in between is executed.
         /* TEST: For every positive/negative parent tag, add/substract half point. Up to 3 points. *\/
                    $up = $nodesToScore[$pt];
                    $score = 0;
                    while ($up->parentNode instanceof DOMElement) {
                        $up = $up->parentNode;
                        if (preg_match($this->regexps['positive'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
                            $score += 0.5;
                        } else if (preg_match($this->regexps['negative'], $up->getAttribute('class') . ' ' . $up->getAttribute('id'))) {
                            $score -= 0.5;
                        }
                    }
                    $score = floor($score);
                    $contentScore += max(min($score, 3), -3);/**/
         // Add the score to the parent. The grandparent gets the score divided by GRANDPARENT_SCORE_DIVISOR.
         $parentNode->getAttributeNode('readability')->value += $contentScore;
         if ($grandParentNode) {
             $grandParentNode->getAttributeNode('readability')->value += $contentScore / self::GRANDPARENT_SCORE_DIVISOR;
         }
     }
     /**
      * Node prepping: trash nodes that look cruddy (like ones with the class name "comment", etc).
      * This is faster to do before scoring but safer after.
      */
     if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS) && $xpath) {
         // A footer/aside that is the only one of its kind in the document.
         $candidates = $xpath->query('.//*[(self::footer and count(//footer)<2) or (self::aside and count(//aside)<2)]', $page->documentElement);
         // Walk the result backwards while removing nodes.
         for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) {
             $node = $candidates->item($c);
             // node should be readable but not inside of an article otherwise it's probably non-readable block
             if ($node->hasAttribute('readability') && (int) $node->getAttributeNode('readability')->value < 40 && ($node->parentNode ? strcasecmp($node->parentNode->tagName, 'article') !== 0 : true)) {
                 $this->dbg('Removing unlikely candidate ' . $node->getNodePath() . ' by "' . $node->tagName . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                 $node->parentNode->removeChild($node);
             }
         }
         // Every non-body element carrying class/id/style whose score is below 40 or missing.
         $candidates = $xpath->query('.//*[not(self::body) and (@class or @id or @style) and ((number(@readability) < 40) or not(@readability))]', $page->documentElement);
         for ($node = null, $c = $candidates->length - 1; $c >= 0; $c--) {
             $node = $candidates->item($c);
             /* Remove unlikely candidates */
             $unlikelyMatchString = $node->getAttribute('class') . " " . $node->getAttribute('id') . " " . $node->getAttribute('style');
             //$this->dbg('Processing '.$node->getNodePath().' by "'. $unlikelyMatchString.'" with readability '.($node->hasAttribute('readability') ? (int)$node->getAttributeNode('readability')->value : 0));
             // The string always contains the two separators, so "> 3" demands at least two real characters.
             if (mb_strlen($unlikelyMatchString) > 3 && preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString)) {
                 $this->dbg('Removing unlikely candidate ' . $node->getNodePath() . ' by "' . $unlikelyMatchString . '" with readability ' . ($node->hasAttribute('readability') ? (int) $node->getAttributeNode('readability')->value : 0));
                 $node->parentNode->removeChild($node);
             }
         }
         unset($candidates);
     }
     /**
      * After we've calculated scores, loop through all of the possible candidate nodes we found
      * and find the one with the highest score.
      **/
     $topCandidate = null;
     if ($xpath) {
         // Using array of DOMElements after deletion is a path to DOOMElement.
         $candidates = $xpath->query('.//*[@data-candidate]', $page->documentElement);
         for ($c = $candidates->length - 1; $c >= 0; $c--) {
             // Scale the final candidates score based on link density. Good content should have a
             // relatively small link density (5% or less) and be mostly unaffected by this operation.
             // If not for this we would have used XPath to find maximum @readability.
             $readability = $candidates->item($c)->getAttributeNode('readability');
             $readability->value = round($readability->value * (1 - $this->getLinkDensity($candidates->item($c))), 0, PHP_ROUND_HALF_UP);
             if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
                 $this->dbg('Candidate: ' . $candidates->item($c)->getNodePath() . ' (' . $candidates->item($c)->getAttribute('class') . ':' . $candidates->item($c)->getAttribute('id') . ') with score ' . $readability->value);
                 $topCandidate = $candidates->item($c);
             }
         }
         unset($candidates);
     }
     /**
      * If we still have no top candidate, just use the body as a last resort.
      * We also have to copy the body node so it is something we can modify.
      **/
     if ($topCandidate === null || strcasecmp($topCandidate->tagName, 'body') === 0) {
         $topCandidate = $this->dom->createElement('div');
         if ($page instanceof DOMDocument) {
             if (!isset($page->documentElement)) {
                 // we don't have a body either? what a mess! :)
                 // The new div stays empty and unattached in this case.
                 $this->dbg('The page has no body!');
             } else {
                 $this->dbg('Setting body to a raw HTML of original page!');
                 $topCandidate->innerHTML = $page->documentElement->innerHTML;
                 $page->documentElement->innerHTML = '';
                 $page->documentElement->appendChild($topCandidate);
             }
         } else {
             $topCandidate->innerHTML = $page->innerHTML;
             $page->innerHTML = '';
             $page->appendChild($topCandidate);
         }
         $this->initializeNode($topCandidate);
     }
     // Set table as the main node if resulted data is table element.
     $tagName = $topCandidate->tagName;
     if (strcasecmp($tagName, 'td') === 0 || strcasecmp($tagName, 'tr') === 0) {
         $up = $topCandidate;
         while ($up->parentNode instanceof DOMElement) {
             $up = $up->parentNode;
             if (strcasecmp($up->tagName, 'table') === 0) {
                 $topCandidate = $up;
                 break;
             }
         }
     }
     $this->dbg('Top candidate: ' . $topCandidate->getNodePath());
     /**
      * Now that we have the top candidate, look through its siblings for content that might also be related.
      * Things like preambles, content split by ads that we removed, etc.
      **/
     $articleContent = $this->dom->createElement('div');
     $articleContent->setAttribute('id', 'readability-content');
     $siblingScoreThreshold = max(10, (int) $topCandidate->getAttribute('readability') * 0.2);
     // The fallback div created above has no parent when the document had no root element;
     // treat that as "no siblings" instead of dereferencing null.
     $topCandidateParent = $topCandidate->parentNode;
     $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
     $sl = $siblingNodes !== null ? $siblingNodes->length : 0;
     for ($s = 0; $s < $sl; $s++) {
         $siblingNode = $siblingNodes->item($s);
         $siblingNodeName = $siblingNode->nodeName;
         $append = false;
         $this->dbg('Looking at sibling node: ' . $siblingNode->getNodePath() . ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));
         //$this->dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
         if ($siblingNode->isSameNode($topCandidate)) {
             $append = true;
         }
         $contentBonus = 0;
         // Give a bonus if sibling nodes and top candidates have the same classname.
         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
             $contentBonus += (int) $topCandidate->getAttribute('readability') * 0.2;
         }
         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold) {
             $append = true;
         }
         if (strcasecmp($siblingNodeName, 'p') === 0) {
             $linkDensity = $this->getLinkDensity($siblingNode);
             $nodeContent = $this->getInnerText($siblingNode, true, true);
             $nodeLength = mb_strlen($nodeContent);
             if ($nodeLength > self::MIN_NODE_LENGTH && $linkDensity < self::MAX_LINK_DENSITY) {
                 // Long paragraph with few links.
                 $append = true;
             } else {
                 // Short paragraph without links that contains a sentence end.
                 // NOTE(review): "=== 0" only matches an integer zero — verify the type getLinkDensity() returns.
                 if ($nodeLength < self::MIN_NODE_LENGTH && $linkDensity === 0 && preg_match('/\\.( |$)/', $nodeContent)) {
                     $append = true;
                 }
             }
         }
         if ($append) {
             $this->dbg('Appending node: ' . $siblingNode->getNodePath());
             $nodeToAppend = null;
             if (strcasecmp($siblingNodeName, 'div') !== 0 && strcasecmp($siblingNodeName, 'p') !== 0) {
                 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
                 $this->dbg('Altering siblingNode ' . $siblingNodeName . ' to div.');
                 $nodeToAppend = $this->dom->createElement('div');
                 try {
                     $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                     // Keep the original tag name around in the "alt" attribute.
                     $nodeToAppend->setAttribute('alt', $siblingNodeName);
                     $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                 } catch (Exception $e) {
                     $this->dbg('Could not alter siblingNode ' . $siblingNodeName . ' to div, reverting to original.');
                     $nodeToAppend = $siblingNode;
                     $s--;
                     $sl--;
                 }
             } else {
                 // The sibling itself is moved, which takes it out of the list being walked.
                 $nodeToAppend = $siblingNode;
                 $s--;
                 $sl--;
             }
             // To ensure a node does not interfere with readability styles, remove its classnames & ids.
             // Now done via RegExp post_filter.
             //$nodeToAppend->removeAttribute('class');
             //$nodeToAppend->removeAttribute('id');
             // Append sibling and subtract from our list as appending removes a node.
             $articleContent->appendChild($nodeToAppend);
         }
     }
     unset($xpath);
     // So we have all of the content that we need. Now we clean it up for presentation.
     $this->prepArticle($articleContent);
     /**
      * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
      * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
      * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
      * finding the -right- content.
      **/
     if (mb_strlen($this->getInnerText($articleContent, false)) >= self::MIN_ARTICLE_LENGTH) {
         return $articleContent;
     }
     // Restore the cached body markup before trying again.
     if (!$this->body->hasChildNodes()) {
         $this->body = $this->dom->createElement('body');
     }
     $this->body->innerHTML = $this->bodyCache;
     // Drop one flag per attempt, in a fixed order.
     if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
         $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
         $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to strip unlikely content.\n");
         return $this->grabArticle($this->body);
     }
     if ($this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) {
         $this->removeFlag(self::FLAG_WEIGHT_ATTRIBUTES);
         $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to weight attributes.\n");
         return $this->grabArticle($this->body);
     }
     if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
         $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
         $this->dbg("...content is shorter than " . self::MIN_ARTICLE_LENGTH . " letters, trying not to clean at all.\n");
         return $this->grabArticle($this->body);
     }
     // Nothing left to relax.
     return false;
 }
Example #2
0
 /**
  * Find the element that most likely holds the article and return it wrapped in a new container.
  *
  * Passes over $page:
  *  1. optionally remove elements whose class/id match the "unlikely candidate" patterns,
  *     queue <p>/<td>/<pre> elements for scoring and turn <div> elements without
  *     block-level content into <p> elements;
  *  2. score every queued node and credit the score to its parent (full) and grandparent (half);
  *  3. pick the candidate with the best link-density-adjusted score;
  *  4. move the candidate and its related siblings into a new div#readability-content
  *     and pass that to prepArticle().
  *
  * When the extracted text is shorter than 250 bytes the method calls itself again on
  * $this->body with one more flag switched off (FLAG_STRIP_UNLIKELYS, then
  * FLAG_WEIGHT_CLASSES, then FLAG_CLEAN_CONDITIONALLY).
  *
  * @param DOMDocument|DOMElement|null $page Node to analyse; defaults to $this->dom.
  *
  * @return DOMElement|false The div#readability-content element, or false when no flag
  *                          is left to drop and the content is still too short.
  */
 protected function grabArticle($page = null)
 {
     $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
     if (!$page) {
         $page = $this->dom;
     }
     $allElements = $page->getElementsByTagName('*');
     /**
      * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
      * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
      *
      * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
      * TODO: Shouldn't this be a reverse traversal?
      **/
     $nodesToScore = array();
     for ($nodeIndex = 0; $node = $allElements->item($nodeIndex); $nodeIndex++) {
         //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
         //$node = $targetList->item($nodeIndex);
         $tagName = strtoupper($node->tagName);
         /* Remove unlikely candidates */
         if ($stripUnlikelyCandidates) {
             // Keep class and id apart so that a pattern cannot match across the boundary between them.
             $unlikelyMatchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
             if (preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && $tagName != 'BODY') {
                 $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                 //$nodesToRemove[] = $node;
                 $node->parentNode->removeChild($node);
                 // Stay on the same position after the removal.
                 $nodeIndex--;
                 continue;
             }
         }
         if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
             $nodesToScore[] = $node;
         }
         /* Turn all divs that don't have children block level elements into p's */
         if ($tagName == 'DIV') {
             if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                 //$this->dbg('Altering div to p');
                 $newNode = $this->dom->createElement('p');
                 try {
                     $newNode->innerHTML = $node->innerHTML;
                     //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
                     $node->parentNode->replaceChild($newNode, $node);
                     // Look at the same position again on the next iteration.
                     $nodeIndex--;
                     // The replaced <div> is queued here; it has no parent any more, so the scoring
                     // loop below skips it. NOTE(review): presumably the new <p> is queued by the
                     // 'P' test when its position is visited again — confirm before changing this.
                     $nodesToScore[] = $node;
                     // or $newNode?
                 } catch (Exception $e) {
                     $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                 }
             } else {
                 /* EXPERIMENTAL */
                 // TODO: change these p elements back to text nodes after processing
                 for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                     $childNode = $node->childNodes->item($i);
                     if ($childNode->nodeType == 3) {
                         // XML_TEXT_NODE
                         //$this->dbg('replacing text node with a p tag with the same content.');
                         $p = $this->dom->createElement('p');
                         $p->innerHTML = $childNode->nodeValue;
                         $p->setAttribute('style', 'display: inline;');
                         $p->setAttribute('class', 'readability-styled');
                         $childNode->parentNode->replaceChild($p, $childNode);
                     }
                 }
             }
         }
     }
     /**
      * Loop through all paragraphs, and assign a score to them based on how content-y they look.
      * Then add their score to their parent node.
      *
      * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
      **/
     $candidates = array();
     // The queue is not modified below, so its size is read once.
     for ($pt = 0, $scored = count($nodesToScore); $pt < $scored; $pt++) {
         $parentNode = $nodesToScore[$pt]->parentNode;
         // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
         $grandParentNode = !$parentNode ? null : ($parentNode->parentNode instanceof DOMElement ? $parentNode->parentNode : null);
         $innerText = $this->getInnerText($nodesToScore[$pt]);
         // Skip detached nodes and nodes whose parent is not an element.
         if (!$parentNode || !isset($parentNode->tagName)) {
             continue;
         }
         /* If this paragraph is less than 25 characters, don't even count it. */
         // NOTE(review): strlen() counts bytes, so multi-byte text reaches the limit sooner.
         if (strlen($innerText) < 25) {
             continue;
         }
         /* Initialize readability data for the parent. */
         if (!$parentNode->hasAttribute('readability')) {
             $this->initializeNode($parentNode);
             $candidates[] = $parentNode;
         }
         /* Initialize readability data for the grandparent. */
         if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) {
             $this->initializeNode($grandParentNode);
             $candidates[] = $grandParentNode;
         }
         $contentScore = 0;
         /* Add a point for the paragraph itself as a base. */
         $contentScore++;
         /* Add points for any commas within this paragraph */
         // explode() yields one piece more than there are commas, so this adds (commas + 1).
         $contentScore += count(explode(',', $innerText));
         /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
         $contentScore += min(floor(strlen($innerText) / 100), 3);
         /* Add the score to the parent. The grandparent gets half. */
         $parentNode->getAttributeNode('readability')->value += $contentScore;
         if ($grandParentNode) {
             $grandParentNode->getAttributeNode('readability')->value += $contentScore / 2;
         }
     }
     /**
      * After we've calculated scores, loop through all of the possible candidate nodes we found
      * and find the one with the highest score.
      **/
     $topCandidate = null;
     for ($c = 0, $cl = count($candidates); $c < $cl; $c++) {
         /**
          * Scale the final candidates score based on link density. Good content should have a
          * relatively small link density (5% or less) and be mostly unaffected by this operation.
          **/
         $readability = $candidates[$c]->getAttributeNode('readability');
         $readability->value = $readability->value * (1 - $this->getLinkDensity($candidates[$c]));
         $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
         if (!$topCandidate || $readability->value > (int) $topCandidate->getAttribute('readability')) {
             $topCandidate = $candidates[$c];
         }
     }
     /**
      * If we still have no top candidate, just use the body as a last resort.
      * We also have to copy the body node so it is something we can modify.
      **/
     if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') {
         $topCandidate = $this->dom->createElement('div');
         if ($page instanceof DOMDocument) {
             if (!isset($page->documentElement)) {
                 // we don't have a body either? what a mess! :)
                 // The new div stays empty and unattached in this case.
             } else {
                 $topCandidate->innerHTML = $page->documentElement->innerHTML;
                 $page->documentElement->innerHTML = '';
                 $this->reinitBody();
                 $page->documentElement->appendChild($topCandidate);
             }
         } else {
             $topCandidate->innerHTML = $page->innerHTML;
             $page->innerHTML = '';
             $page->appendChild($topCandidate);
         }
         $this->initializeNode($topCandidate);
     }
     /**
      * Now that we have the top candidate, look through its siblings for content that might also be related.
      * Things like preambles, content split by ads that we removed, etc.
      **/
     $articleContent = $this->dom->createElement('div');
     $articleContent->setAttribute('id', 'readability-content');
     $siblingScoreThreshold = max(10, (int) $topCandidate->getAttribute('readability') * 0.2);
     // An unattached candidate has no parent and therefore no siblings; check for that
     // explicitly instead of silencing the failed property access.
     $topCandidateParent = $topCandidate->parentNode;
     $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
     $sl = $siblingNodes !== null ? $siblingNodes->length : 0;
     for ($s = 0; $s < $sl; $s++) {
         $siblingNode = $siblingNodes->item($s);
         $append = false;
         $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') ? ' with score ' . $siblingNode->getAttribute('readability') : ''));
         //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
         if ($siblingNode === $topCandidate) {
             $append = true;
         }
         $contentBonus = 0;
         /* Give a bonus if sibling nodes and top candidates have the example same classname */
         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
             $contentBonus += (int) $topCandidate->getAttribute('readability') * 0.2;
         }
         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (int) $siblingNode->getAttribute('readability') + $contentBonus >= $siblingScoreThreshold) {
             $append = true;
         }
         if (strtoupper($siblingNode->nodeName) == 'P') {
             $linkDensity = $this->getLinkDensity($siblingNode);
             $nodeContent = $this->getInnerText($siblingNode);
             $nodeLength = strlen($nodeContent);
             if ($nodeLength > 80 && $linkDensity < 0.25) {
                 // Long paragraph with few links.
                 $append = true;
             } else {
                 // Short paragraph without links that contains a sentence end.
                 // NOTE(review): "=== 0" only matches an integer zero — verify the type getLinkDensity() returns.
                 if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\\.( |$)/', $nodeContent)) {
                     $append = true;
                 }
             }
         }
         if ($append) {
             $this->dbg('Appending node: ' . $siblingNode->nodeName);
             $nodeToAppend = null;
             $sibNodeName = strtoupper($siblingNode->nodeName);
             if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
                 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                 $nodeToAppend = $this->dom->createElement('div');
                 try {
                     $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                     $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                 } catch (Exception $e) {
                     $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                     $nodeToAppend = $siblingNode;
                     $s--;
                     $sl--;
                 }
             } else {
                 // The sibling itself is moved, which takes it out of the list being walked.
                 $nodeToAppend = $siblingNode;
                 $s--;
                 $sl--;
             }
             /* To ensure a node does not interfere with readability styles, remove its classnames */
             $nodeToAppend->removeAttribute('class');
             /* Append sibling and subtract from our list because it removes the node when you append to another node */
             $articleContent->appendChild($nodeToAppend);
         }
     }
     /**
      * So we have all of the content that we need. Now we clean it up for presentation.
      **/
     $this->prepArticle($articleContent);
     /**
      * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
      * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
      * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
      * finding the -right- content.
      **/
     if (strlen($this->getInnerText($articleContent, false)) >= 250) {
         return $articleContent;
     }
     // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
     // in the meantime, we check and create an empty element if it's not there.
     $this->reinitBody();
     // Drop one flag per attempt, in a fixed order.
     if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
         $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
         return $this->grabArticle($this->body);
     }
     if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
         $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
         return $this->grabArticle($this->body);
     }
     if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
         $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
         return $this->grabArticle($this->body);
     }
     // Nothing left to relax.
     return false;
 }