Example #1
1
 /**
  * Checks the density of links within a node: does it contain little text, most of which is link text?
  * If so, the node is treated as navigation/boilerplate rather than content.
  *
  * @param Element $node
  * @param float $limit
  *
  * @return bool
  */
 private function isHighLinkDensity(Element $node, $limit = 1.0)
 {
     // Score = (words inside links / words in the node) * number of links.
     // A node scoring at or above $limit is reported as link-dense.

     // Anchors and any element carrying an onclick attribute both count as links.
     $links = $node->find('a, [onclick]');
     $numberOfLinks = $links->count();
     if ($numberOfLinks === 0) {
         return false;
     }

     // The same whitespace splitter is used for node text and link text so
     // both word counts are measured the same way.
     $wordSeparator = '@[\\s]+@iu';

     // preg_split() returns false when the subject is not valid UTF-8 (the
     // pattern uses the `u` modifier); treat that like a node with no words
     // instead of passing false to count().
     $words = preg_split($wordSeparator, $node->text(), -1, PREG_SPLIT_NO_EMPTY);
     if ($words === false || count($words) === 0) {
         return false;
     }

     $linkTexts = [];
     foreach ($links as $link) {
         $linkTexts[] = Helper::textNormalise($link->text());
     }

     // Join with a space so the last word of one link and the first word of
     // the next are not fused into a single word, and drop empty pieces so a
     // link without any text contributes zero words rather than one.
     $linkWords = preg_split($wordSeparator, implode(' ', $linkTexts), -1, PREG_SPLIT_NO_EMPTY);
     $numberOfLinkWords = $linkWords === false ? 0 : count($linkWords);

     $score = ($numberOfLinkWords / count($words)) * $numberOfLinks;

     return $score >= $limit;
 }
Example #2
1
 /**
  * @param Element $node
  *
  * @return LocallyStoredImage[]
  */
 private function getImageCandidates(Element $node)
 {
     // Pipeline: every <img> under the node -> name filter -> byte-size filter.
     return $this->findImagesThatPassByteSizeTest(
         $this->filterBadNames(
             $node->find('img')
         )
     );
 }
Example #3
0
 /**
  * Long articles can have a great many paragraphs, so comparing a sibling against the total
  * score of all paragraphs in the top node would be unfair. Instead the baseline is the average
  * stop-word count of the qualifying paragraphs within the top node. For example, if 10 paragraphs
  * total 1000 but average 100 each, then 100 is the base.
  *
  * @param Element $topNode
  *
  * @return int|float
  */
 private function getBaselineScoreForSiblings(Element $topNode)
 {
     $qualifyingCount = 0;
     $stopWordTotal = 0;

     foreach ($topNode->find('p, strong') as $candidate) {
         $stats = $this->config()->getStopWords()->getStopwordCount($candidate->text());
         $isLinkHeavy = $this->isHighLinkDensity($candidate);

         // Only nodes with more than two stop words and a low link density
         // contribute to the average.
         if ($stats->getStopWordCount() <= 2 || $isLinkHeavy) {
             continue;
         }

         $qualifyingCount++;
         $stopWordTotal += $stats->getStopWordCount();
     }

     // With no qualifying node, fall back to the fixed default of 100000.
     return $qualifyingCount > 0
         ? $stopWordTotal / $qualifyingCount
         : 100000;
 }