/**
 * Decides whether a node is dominated by links rather than by real text.
 *
 * The score is (words found in link text / words in the whole node) multiplied
 * by the number of links; the node is considered link-heavy once that score
 * reaches $limit. Nodes with no links, or with no words at all, are never
 * flagged.
 *
 * Note: the link texts are concatenated without a separator and then split on
 * single spaces, so the link word count is an approximation rather than an
 * exact tally.
 *
 * @param Element $node  Node to inspect
 * @param double $limit  Score at or above which the node is link-heavy
 *
 * @return bool True when the link density score is >= $limit
 */
private function isHighLinkDensity(Element $node, $limit = 1.0)
{
    $anchors = $node->find('a, [onclick]');
    $anchorCount = $anchors->count();

    if ($anchorCount == 0) {
        return false;
    }

    $allWords = preg_split('@[\\s]+@iu', $node->text(), -1, PREG_SPLIT_NO_EMPTY);
    $allWordCount = count($allWords);

    if ($allWordCount == 0) {
        return false;
    }

    // Glue every link's normalised text together, back to back.
    $anchorText = '';
    foreach ($anchors as $anchor) {
        $anchorText .= Helper::textNormalise($anchor->text());
    }

    // Splitting on ' ' yields one more piece than there are spaces.
    $anchorWordCount = substr_count($anchorText, ' ') + 1;

    return ($anchorWordCount / $allWordCount) * $anchorCount >= $limit;
}
/**
 * Scores a top-node candidate: its stop-word count plus a position-based boost.
 *
 * The boost starts at 50 / ($i + 1), so earlier candidates receive more.
 * When there are more than 15 candidates and this one lies within the last
 * quarter of them, the boost is replaced by a penalty equal to the negated
 * square of how far into that last quarter the candidate sits; if the
 * penalty's magnitude exceeds 40 the boost is set to 5 instead.
 *
 * @param Element $node     Candidate node whose text is analysed
 * @param int $i            Zero-based position of the candidate
 * @param int $totalNodes   Total number of candidates
 *
 * @return double Stop-word count of the node's text plus the boost
 */
private function getTopNodeCandidateScore(Element $node, $i, $totalNodes)
{
    $positionBoost = 1.0 / ($i + 1) * 50;
    $tailSize = $totalNodes * 0.25;
    $distanceFromEnd = $totalNodes - $i;

    if ($totalNodes > 15 && $distanceFromEnd <= $tailSize) {
        $depthIntoTail = $tailSize - $distanceFromEnd;
        $positionBoost = pow($depthIntoTail, 2) * -1;

        if (abs($positionBoost) > 40) {
            $positionBoost = 5;
        }
    }

    $stopWordStats = $this->config()->getStopWords()->getStopwordCount($node->text());

    return $stopWordStats->getStopWordCount() + $positionBoost;
}
/**
 * Collects the paragraph-like content of a sibling that is worth keeping.
 *
 * If the sibling itself matches 'p, strong' and has non-blank text, it is
 * returned as-is. Otherwise every descendant matching 'p, strong' is examined:
 * those with non-blank text whose stop-word count exceeds
 * $baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE are
 * copied into newly created <p> elements holding the trimmed text.
 *
 * @param Element $currentSibling
 * @param int $baselineScoreForSiblingParagraphs
 *
 * @return Element[] The sibling itself, freshly created <p> elements, or an empty array
 */
private function getSiblingContent(Element $currentSibling, $baselineScoreForSiblingParagraphs)
{
    $text = trim($currentSibling->text());

    // Compare against '' instead of using empty(): empty('0') is true, which
    // would wrongly discard a paragraph whose entire text is "0".
    if ($currentSibling->is('p, strong') && $text !== '') {
        return [$currentSibling];
    }

    $results = [];
    $threshold = $baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE;

    foreach ($currentSibling->find('p, strong') as $node) {
        $text = trim($node->text());

        if ($text === '') {
            continue;
        }

        $wordStats = $this->config()->getStopWords()->getStopwordCount($text);

        if ($threshold < $wordStats->getStopWordCount()) {
            // Do not pass $text as the second argument of createElement():
            // that value is not escaped, so a bare '&' (e.g. "AT&T") raises an
            // "unterminated entity reference" warning and corrupts the text.
            // A text node stores the string verbatim.
            $document = $node->document();
            $paragraph = $document->createElement('p');
            $paragraph->appendChild($document->createTextNode($text));

            $results[] = $paragraph;
        }
    }

    return $results;
}