Esempio n. 1
1
 /**
  * Decides whether a node is dominated by link text rather than regular prose.
  *
  * The score is (words found inside links / words in the whole node) multiplied
  * by the number of link-like elements, i.e. elements matching `a, [onclick]`.
  * A node whose score reaches $limit is reported as link-heavy.
  *
  * Link words are counted with the same whitespace split that is applied to the
  * node text, and the texts of separate links are joined with a space. This way
  * a link without text contributes no words and the words of adjacent links do
  * not fuse into one.
  *
  * @param Element $node  Node to inspect
  * @param float   $limit Score at or above which the node counts as link-heavy
  *
  * @return bool True when the link density score is greater than or equal to $limit
  */
 private function isHighLinkDensity(Element $node, $limit = 1.0)
 {
     $links = $node->find('a, [onclick]');
     $numberOfLinks = $links->count();
     if ($numberOfLinks == 0) {
         return false;
     }

     $words = preg_split('@[\\s]+@iu', $node->text(), -1, PREG_SPLIT_NO_EMPTY);
     // preg_split() returns false on failure (e.g. malformed UTF-8 with the `u`
     // modifier); treat that like a node without any words.
     if ($words === false || count($words) === 0) {
         return false;
     }

     $linkTexts = [];
     foreach ($links as $link) {
         $linkTexts[] = Helper::textNormalise($link->text());
     }

     // Join with a space so the last word of one link and the first word of the
     // next stay separate, and drop empty pieces so text-less links count as zero.
     $linkWords = preg_split('@[\\s]+@iu', implode(' ', $linkTexts), -1, PREG_SPLIT_NO_EMPTY);
     $numberOfLinkWords = ($linkWords === false) ? 0 : count($linkWords);

     $score = ($numberOfLinkWords / count($words)) * $numberOfLinks;

     return $score >= $limit;
 }
Esempio n. 2
0
 /**
  * Scores a candidate node by adding a position-dependent boost to the
  * stop-word count of its text.
  *
  * The boost is (1.0 / ($i + 1)) * 50, so it shrinks as $i grows. When there
  * are more than 15 candidates and the node lies within the last quarter of
  * them, the boost is replaced by a penalty: the negated square of how far into
  * that last quarter the node sits. If that square exceeds 40, the boost is set
  * to 5 instead.
  *
  * @param Element $node       Candidate node whose text is scored
  * @param int     $i          Position of the node among the candidates
  * @param int     $totalNodes Total number of candidates
  *
  * @return double Stop-word count of the node text plus the positional boost
  */
 private function getTopNodeCandidateScore(Element $node, $i, $totalNodes)
 {
     $positionBoost = (1.0 / ($i + 1)) * 50;

     $tailSize = $totalNodes * 0.25;
     $remaining = $totalNodes - $i;
     if ($totalNodes > 15 && $remaining <= $tailSize) {
         $depthIntoTail = $tailSize - $remaining;
         $penalty = pow($depthIntoTail, 2);
         // Penalties larger than 40 are swapped for a small positive boost.
         $positionBoost = ($penalty > 40) ? 5 : $penalty * -1;
     }

     $stopWordCount = $this->config()
         ->getStopWords()
         ->getStopwordCount($node->text())
         ->getStopWordCount();

     return $stopWordCount + $positionBoost;
 }
Esempio n. 3
0
 /**
  * Collects paragraph content from a sibling node.
  *
  * If the sibling itself matches `p, strong` and its trimmed text is not empty,
  * the sibling is returned as the only result. Otherwise every descendant
  * matching `p, strong` with non-empty trimmed text is checked: when its
  * stop-word count is greater than
  * $baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE, a new
  * `p` element holding the trimmed text is created and added to the result.
  *
  * Emptiness is tested with `!== ''` rather than empty(), because empty('0') is
  * true and would discard text that consists solely of "0".
  *
  * @param Element $currentSibling                    Sibling node to harvest
  * @param int     $baselineScoreForSiblingParagraphs Baseline stop-word score
  *
  * @return Element[] The sibling itself, or newly created `p` elements (possibly none)
  */
 private function getSiblingContent(Element $currentSibling, $baselineScoreForSiblingParagraphs)
 {
     $siblingText = trim($currentSibling->text());
     if ($currentSibling->is('p, strong') && $siblingText !== '') {
         return [$currentSibling];
     }

     // Loop-invariant: minimum stop-word count a paragraph has to exceed.
     $threshold = $baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE;

     $results = [];
     foreach ($currentSibling->find('p, strong') as $node) {
         $text = trim($node->text());
         if ($text === '') {
             continue;
         }

         $wordStats = $this->config()->getStopWords()->getStopwordCount($text);
         if ($threshold < $wordStats->getStopWordCount()) {
             $results[] = $node->document()->createElement('p', $text);
         }
     }

     return $results;
 }