/**
 * Checks the density of links within a node: is there little text, and is most of it link text?
 * If so, the node is treated as link-heavy rather than as content.
 *
 * The score is (number of link words / number of words in the node) * number of links, where:
 *  - the node's words are the whitespace-separated, non-empty tokens of its text;
 *  - the link words come from normalising each link's text with Helper::textNormalise(),
 *    concatenating the results with no separator and splitting on single spaces.
 *
 * NOTE(review): because link texts are joined without a separator, the last word of one link
 * merges with the first word of the next, and explode() yields one (empty) word even when every
 * link text is empty. This counting is kept unchanged because $limit is presumably tuned against
 * it -- confirm before altering.
 *
 * @param Element $node  Node whose links and text are inspected
 * @param double  $limit Score at or above which the node is considered link-dense
 *
 * @return bool True when the score reaches $limit; false when the node has no links, no words,
 *              or its text cannot be tokenised
 */
private function isHighLinkDensity(Element $node, $limit = 1.0)
{
    $links = $node->find('a, [onclick]');
    $numberOfLinks = $links->count();

    if ($numberOfLinks == 0) {
        return false;
    }

    $words = preg_split('@[\\s]+@iu', $node->text(), -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure (for instance invalid UTF-8 under the /u modifier).
    // Passing false to count() throws a TypeError on PHP 8, so treat it like "no words".
    if ($words === false) {
        return false;
    }

    $numberOfWords = count($words);

    if ($numberOfWords === 0) {
        return false;
    }

    $sb = [];

    foreach ($links as $link) {
        $sb[] = Helper::textNormalise($link->text());
    }

    $linkText = implode('', $sb);
    $numberOfLinkWords = count(explode(' ', $linkText));

    // Share of the node's words that sit inside links, weighted by how many links there are.
    $score = ($numberOfLinkWords / $numberOfWords) * $numberOfLinks;

    return $score >= $limit;
}
/**
 * Collects the image candidates found beneath a node.
 *
 * Every <img> descendant of $node is passed through filterBadNames() and the survivors are
 * then passed through findImagesThatPassByteSizeTest(); the result of that last step is returned.
 *
 * @param Element $node Node to search for images
 *
 * @return LocallyStoredImage[]
 */
private function getImageCandidates(Element $node)
{
    return $this->findImagesThatPassByteSizeTest(
        $this->filterBadNames($node->find('img'))
    );
}
/**
 * Computes the baseline score that sibling nodes are compared against.
 *
 * Long articles can contain a large number of paragraphs, so comparing a sibling against the
 * total score of all of them would be unfair. The baseline is therefore the average stop-word
 * count of the qualifying paragraphs in the top node: if 10 paragraphs score 1000 in total,
 * the baseline is 100.
 *
 * A 'p' or 'strong' node qualifies when its text has more than 2 stop words and it is not
 * link-dense according to isHighLinkDensity(). When no node qualifies, the default of 100000
 * is returned.
 *
 * @param Element $topNode Node whose 'p' and 'strong' descendants are scored
 *
 * @return int|float The average stop-word count (a float when the division is inexact),
 *                   or 100000 when there is nothing to average
 */
private function getBaselineScoreForSiblings(Element $topNode)
{
    $qualifyingCount = 0;
    $stopWordTotal = 0;

    foreach ($topNode->find('p, strong') as $candidate) {
        $stats = $this->config()->getStopWords()->getStopwordCount($candidate->text());
        $linkHeavy = $this->isHighLinkDensity($candidate);
        $stopWordCount = $stats->getStopWordCount();

        // Skip nodes with too few stop words or dominated by links.
        if ($stopWordCount <= 2 || $linkHeavy) {
            continue;
        }

        $qualifyingCount++;
        $stopWordTotal += $stopWordCount;
    }

    if ($qualifyingCount > 0) {
        return $stopWordTotal / $qualifyingCount;
    }

    // Nothing qualified: fall back to the large default baseline.
    return 100000;
}