/**
 * Checks the density of links within a node: a node with little text, most of which
 * sits inside links, is considered link-heavy and therefore not useful content.
 *
 * The score is (words inside links / total words in the node) * number of links.
 * Links are every descendant matching 'a, [onclick]'.
 *
 * @param Element $node  Node to inspect
 * @param float   $limit Score at or above which the node is reported as link-heavy
 *
 * @return bool True when the score reaches $limit; false when the node has no links or no words
 */
private function isHighLinkDensity(Element $node, $limit = 1.0)
{
    $links = $node->find('a, [onclick]');
    $numberOfLinks = $links->count();

    if ($numberOfLinks === 0) {
        return false;
    }

    // One tokenisation rule for both counts, so the ratio compares like with like.
    $wordPattern = '@[\\s]+@iu';

    $words = preg_split($wordPattern, $node->text(), -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure (e.g. malformed UTF-8 with the /u modifier).
    if ($words === false || count($words) === 0) {
        return false;
    }

    $linkTexts = [];

    foreach ($links as $link) {
        $linkTexts[] = Helper::textNormalise($link->text());
    }

    // Join with a space so the last word of one link does not fuse with the first word
    // of the next, and drop empty tokens so links without any text contribute zero words.
    $linkWords = preg_split($wordPattern, implode(' ', $linkTexts), -1, PREG_SPLIT_NO_EMPTY);
    $numberOfLinkWords = ($linkWords === false) ? 0 : count($linkWords);

    $score = ($numberOfLinkWords / count($words)) * $numberOfLinks;

    return $score >= $limit;
}
/**
 * Collects the image candidates found under a node.
 *
 * Every <img> descendant is passed through filterBadNames() and the survivors
 * through findImagesThatPassByteSizeTest(); the result of the latter is returned.
 *
 * @param Element $node
 *
 * @return LocallyStoredImage[]
 */
private function getImageCandidates(Element $node)
{
    return $this->findImagesThatPassByteSizeTest(
        $this->filterBadNames($node->find('img'))
    );
}
/**
 * Adds to the running count of decent nodes recorded on a parent node.
 *
 * The count is kept in the node's 'gravityNodes' attribute; the current value is
 * cast to int before the addition and the sum is written back to the same attribute.
 *
 * @param Element $node
 * @param int $addToCount
 */
private function updateNodeCount(Element $node, $addToCount)
{
    $node->attr('gravityNodes', ((int) $node->attr('gravityNodes')) + $addToCount);
}
/**
 * Computes the baseline score that sibling nodes of the top node are measured against.
 *
 * Long articles contain many paragraphs, so comparing a sibling against the total
 * score of the top node would be unfair. Instead the baseline is the average stop-word
 * count of the qualifying 'p, strong' descendants of the top node. For example, ten
 * paragraphs totalling 1000 with an average of 100 each give a baseline of 100.
 *
 * A descendant qualifies when it has more than 2 stop words and is not link-heavy
 * according to isHighLinkDensity(). When nothing qualifies the baseline is 100000.
 *
 * @param Element $topNode
 *
 * @return int|float The average is produced by a division, so it may be fractional
 */
private function getBaselineScoreForSiblings(Element $topNode)
{
    $paragraphCount = 0;
    $stopWordTotal = 0;

    foreach ($topNode->find('p, strong') as $candidate) {
        $stats = $this->config()->getStopWords()->getStopwordCount($candidate->text());
        $linkHeavy = $this->isHighLinkDensity($candidate);
        $stopWords = $stats->getStopWordCount();

        // Skip thin or link-dominated nodes; they must not drag the average around.
        if ($stopWords <= 2 || $linkHeavy) {
            continue;
        }

        $paragraphCount++;
        $stopWordTotal += $stopWords;
    }

    if ($paragraphCount === 0) {
        return 100000;
    }

    return $stopWordTotal / $paragraphCount;
}
/**
 * Generate <p> element replacements for the supplied element's child nodes as required.
 *
 * Walks the direct children of $node. Text nodes, together with the <a> siblings
 * gathered around them, are accumulated in a buffer; every other child flushes the
 * buffer (via getFlushedBuffer()) into the result and is then appended itself. Text
 * nodes and the <a> siblings that were copied into the buffer are removed from the
 * document at the end.
 *
 * @param Element $node
 *
 * @return \DOMWrap\NodeList Replacement elements
 */
private function getReplacementNodes(Element $node)
{
    $nodesToReturn = $node->newNodeList();
    $nodesToRemove = $node->newNodeList();
    $replacementNodes = $node->newNodeList();

    // Predicate handed to precedingUntil()/followingUntil(): true for anything that is
    // not an <a> element (text nodes included). Always returns a bool.
    $fnCompareSiblingNodes = function ($node) {
        return $node->is(':not(a)') || $node->nodeType === XML_TEXT_NODE;
    };

    foreach ($node->contents() as $child) {
        if ($child->nodeType === XML_TEXT_NODE) {
            $replaceText = $child->text();

            // Compare against '' instead of using empty(): empty('0') is true, which would
            // silently drop a text node whose content is exactly "0" (it is still removed below).
            if ((string) $replaceText !== '') {
                // Get all previous sibling <a> nodes, the current text node, and all next sibling <a> nodes.
                $siblings = $child->precedingUntil($fnCompareSiblingNodes, 'a')
                    ->merge([$child])
                    ->merge($child->followingUntil($fnCompareSiblingNodes, 'a'));

                foreach ($siblings as $sibling) {
                    if ($sibling->isSameNode($child)) {
                        // Place current node's textual contents in-between previous and next nodes.
                        $replacementNodes[] = new Text($replaceText);
                    } elseif ($sibling->getAttribute('grv-usedalready') !== 'yes') {
                        // Grab the contents of any unprocessed <a> siblings and flag them for removal.
                        $sibling->setAttribute('grv-usedalready', 'yes');

                        $replacementNodes[] = $sibling->cloneNode(true);
                        $nodesToRemove[] = $sibling;
                    }
                }
            }

            $nodesToRemove[] = $child;

            continue;
        }

        // Any non-text child (a <p> or anything else) ends the current run of buffered
        // text/link nodes: flush the buffer first, then keep the child itself.
        if ($replacementNodes->count()) {
            $nodesToReturn[] = $this->getFlushedBuffer($replacementNodes);
            $replacementNodes->fromArray([]);
        }

        $nodesToReturn[] = $child;
    }

    // Flush any remaining replacementNodes left over from text nodes.
    if ($replacementNodes->count()) {
        $nodesToReturn[] = $this->getFlushedBuffer($replacementNodes);
    }

    // Remove potential duplicate <a> tags.
    foreach ($nodesToReturn as $key => $return) {
        if ($nodesToRemove->exists($return)) {
            unset($nodesToReturn[$key]);
        }
    }

    $nodesToRemove->remove();

    return $nodesToReturn;
}
/**
 * Decides whether a node is connected to enough surrounding text to justify boosting its parent.
 *
 * The first paragraph is often just a caption under an image, so a boost is only allowed
 * when a nearby paragraph carries some substantial weight. The preceding sibling elements
 * of the node are examined; only those matching 'p, strong' are considered. The answer is
 * true as soon as one of them has more than 5 stop words, and false once 3 such siblings
 * have been examined without success or the siblings run out.
 *
 * @param Element $node
 *
 * @return bool
 */
private function isOkToBoost(Element $node)
{
    $minimumStopWordCount = 5;
    $maxStepsAwayFromNode = 3;
    $paragraphsSeen = 0;

    // Only element nodes among the preceding siblings are of interest.
    $previousElements = $node->precedingAll(function ($candidate) {
        return $candidate instanceof Element;
    });

    foreach ($previousElements as $previous) {
        if (!$previous->is('p, strong')) {
            continue;
        }

        if ($paragraphsSeen >= $maxStepsAwayFromNode) {
            return false;
        }

        $stats = $this->config()->getStopWords()->getStopwordCount($previous->text());

        if ($stats->getStopWordCount() > $minimumStopWordCount) {
            return true;
        }

        $paragraphsSeen++;
    }

    return false;
}