Example #1
1
 /**
  * Checks the link density of a node: returns true when the node has little text and most of
  * that text is link text, which marks the node as poor content.
  *
  * @param Element $node
  * @param double $limit
  *
  * @return bool
  */
 private function isHighLinkDensity(Element $node, $limit = 1.0)
 {
     // Purpose: decide whether link text dominates this node's text.
     // score = (words inside links / words in node) * number of links; true when score >= $limit.
     $links = $node->find('a, [onclick]');
     $numberOfLinks = $links->count();
     if ($numberOfLinks < 1) {
         return false;
     }

     // The same tokenizer is used for the node text and the link text so both counts are comparable.
     $wordPattern = '@[\\s]+@iu';

     $words = preg_split($wordPattern, $node->text(), -1, PREG_SPLIT_NO_EMPTY);
     // preg_split() returns false on failure (e.g. malformed UTF-8 under the /u modifier);
     // treat that like "no words" instead of passing false to count().
     if ($words === false || count($words) === 0) {
         return false;
     }
     $numberOfWords = count($words);

     $linkTexts = [];
     foreach ($links as $link) {
         $linkTexts[] = Helper::textNormalise($link->text());
     }

     // Join with a space: gluing the texts together directly would fuse the last word of one link
     // with the first word of the next and undercount link words. PREG_SPLIT_NO_EMPTY also makes
     // links without any text contribute zero words (explode(' ', '') would have reported one).
     $linkWords = preg_split($wordPattern, implode(' ', $linkTexts), -1, PREG_SPLIT_NO_EMPTY);
     $numberOfLinkWords = ($linkWords === false) ? 0 : count($linkWords);

     $score = ($numberOfLinkWords / $numberOfWords) * $numberOfLinks;

     return $score >= $limit;
 }
Example #2
1
 /**
  * @param Element $node
  *
  * @return LocallyStoredImage[]
  */
 private function getImageCandidates(Element $node)
 {
     // Every <img> found under $node is passed through filterBadNames() and the survivors
     // through findImagesThatPassByteSizeTest(); the result of that last call is returned.
     return $this->findImagesThatPassByteSizeTest(
         $this->filterBadNames($node->find('img'))
     );
 }
 /**
  * Stores how many decent nodes are under a parent node
  *
  * @param Element $node
  * @param int $addToCount
  */
 private function updateNodeCount(Element $node, $addToCount)
 {
     $attribute = 'gravityNodes';

     // Read the running total stored on the node (cast to int), then write back the new total.
     $previousTotal = (int) $node->attr($attribute);
     $node->attr($attribute, $previousTotal + $addToCount);
 }
Example #4
0
 /**
  * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
  * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
  * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
  * 100 then 100 should be our base.
  *
  * @param Element $topNode
  *
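  * @return int|float Average stop-word count per qualifying paragraph (a float when the division is not exact), or 100000 when no paragraph qualifies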
  */
 private function getBaselineScoreForSiblings(Element $topNode)
 {
     $qualifyingCount = 0;
     $stopWordTotal = 0;

     foreach ($topNode->find('p, strong') as $candidate) {
         $stats = $this->config()->getStopWords()->getStopwordCount($candidate->text());
         $linkHeavy = $this->isHighLinkDensity($candidate);

         // Only nodes with more than two stop words that are not link-heavy feed the average.
         if ($stats->getStopWordCount() <= 2 || $linkHeavy) {
             continue;
         }

         $qualifyingCount++;
         $stopWordTotal += $stats->getStopWordCount();
     }

     // With no qualifying node the fallback baseline of 100000 is returned.
     return $qualifyingCount > 0 ? $stopWordTotal / $qualifyingCount : 100000;
 }
Example #5
0
 /**
  * Generate <p> element replacements for supplied elements child nodes as required.
  *
  * @param Element $node
  *
  * @return \DOMWrap\NodeList $nodesToReturn Replacement elements
  */
 private function getReplacementNodes(Element $node)
 {
     // Walks the direct contents of $node and builds the list of nodes that should replace them:
     // runs of text nodes (plus adjacent <a> siblings) are buffered and flushed through
     // getFlushedBuffer(), every other child is passed through unchanged.
     // NOTE(review): getFlushedBuffer() is not visible here; per this method's docblock it
     // presumably wraps the buffered nodes in a new <p> element - confirm.

     // Result list, in document order.
     $nodesToReturn = $node->newNodeList();
     // Original nodes that have been copied into a buffer and must be removed afterwards.
     $nodesToRemove = $node->newNodeList();
     // Buffer of text / cloned <a> nodes waiting to be flushed.
     $replacementNodes = $node->newNodeList();
     // Predicate handed to precedingUntil()/followingUntil(). Its $node parameter shadows the
     // method's $node (nothing is captured with `use`). It returns true for anything that is not
     // an <a> or that is a text node, and falls through (returning null) otherwise.
     // NOTE(review): presumably this makes the traversal stop at the first non-<a> sibling -
     // confirm against the DOMWrap traversal API.
     $fnCompareSiblingNodes = function ($node) {
         if ($node->is(':not(a)') || $node->nodeType == XML_TEXT_NODE) {
             return true;
         }
     };
     foreach ($node->contents() as $child) {
         // A <p> child while the buffer holds content: flush the buffer first, then keep the <p>.
         // (A <p> child with an empty buffer falls through to the generic branch below.)
         if ($child->is('p') && $replacementNodes->count()) {
             $nodesToReturn[] = $this->getFlushedBuffer($replacementNodes);
             $replacementNodes->fromArray([]);
             $nodesToReturn[] = $child;
         } else {
             if ($child->nodeType == XML_TEXT_NODE) {
                 $replaceText = $child->text();
                 // NOTE(review): empty() is also true for the string "0", so a text node that is
                 // exactly "0" is not buffered (it is still flagged for removal below).
                 if (!empty($replaceText)) {
                     // Get all previous sibling <a> nodes, the current text node, and all next sibling <a> nodes.
                     $siblings = $child->precedingUntil($fnCompareSiblingNodes, 'a')->merge([$child])->merge($child->followingUntil($fnCompareSiblingNodes, 'a'));
                     foreach ($siblings as $sibling) {
                         // Place current nodes textual contents in-between previous and next nodes.
                         if ($sibling->isSameNode($child)) {
                             $replacementNodes[] = new Text($replaceText);
                         } else {
                             // Grab the contents of any unprocessed <a> siblings and flag them for removal.
                             // The 'grv-usedalready' attribute marks a sibling as consumed so it is
                             // cloned into the buffer at most once.
                             if ($sibling->getAttribute('grv-usedalready') != 'yes') {
                                 $sibling->setAttribute('grv-usedalready', 'yes');
                                 $replacementNodes[] = $sibling->cloneNode(true);
                                 $nodesToRemove[] = $sibling;
                             }
                         }
                     }
                 }
                 // The text node itself is always flagged for removal, even when it was empty.
                 $nodesToRemove[] = $child;
             } else {
                 // Any other child ends the current text run: flush the buffer, then keep the child.
                 if ($replacementNodes->count()) {
                     $nodesToReturn[] = $this->getFlushedBuffer($replacementNodes);
                     $replacementNodes->fromArray([]);
                 }
                 $nodesToReturn[] = $child;
             }
         }
     }
     // Flush any remaining replacementNodes left over from text nodes.
     if ($replacementNodes->count()) {
         $nodesToReturn[] = $this->getFlushedBuffer($replacementNodes);
     }
     // Remove potential duplicate <a> tags: an <a> that was cloned into a buffer may also have
     // been appended as a regular child, so drop every returned node that is flagged for removal.
     foreach ($nodesToReturn as $key => $return) {
         if ($nodesToRemove->exists($return)) {
             unset($nodesToReturn[$key]);
         }
     }
     // NOTE(review): presumably detaches the flagged originals from the document - confirm
     // against DOMWrap's NodeList::remove().
     $nodesToRemove->remove();
     return $nodesToReturn;
 }
 /**
  * Often the first paragraph is just the caption under an image, so before boosting a parent node we make sure
  * the paragraph is connected to other paragraphs, at least within the first few paragraphs.
  * To do that we look at the preceding siblings and require a nearby paragraph that has some substantial weight to it.
  *
  * @param Element $node
  *
  * @return bool
  */
 private function isOkToBoost(Element $node)
 {
     $requiredStopWords = 5;
     $paragraphBudget = 3;
     $paragraphsSeen = 0;

     // Only element nodes among the preceding siblings are considered.
     $elementSiblings = $node->precedingAll(function ($candidate) {
         return $candidate instanceof Element;
     });

     foreach ($elementSiblings as $previous) {
         if (!$previous->is('p, strong')) {
             continue;
         }

         // Give up once the allowed number of weak paragraphs has been inspected.
         if ($paragraphsSeen >= $paragraphBudget) {
             return false;
         }

         $stats = $this->config()->getStopWords()->getStopwordCount($previous->text());
         if ($stats->getStopWordCount() > $requiredStopWords) {
             return true;
         }

         $paragraphsSeen++;
     }

     return false;
 }