/**
 * Checks the density of links within a node: if the node holds little text
 * and most of that text sits inside links, it is unlikely to be article
 * content.
 *
 * The score is (link words / total words) * number of links, where "links"
 * are the descendants matching `a, [onclick]`. The node is considered
 * link-dense when the score reaches $limit.
 *
 * Words are counted the same way for the node text and for the link text
 * (split on whitespace runs, empty tokens dropped), so that:
 *  - adjacent links do not have their words fused into a single token, and
 *  - links without any text contribute zero words instead of one.
 *
 * @param Element $node  Node to inspect
 * @param float   $limit Score at or above which the node is link-dense
 *
 * @return bool True when the node has links and its score is >= $limit;
 *              false when it has no links, no words, or a lower score
 */
private function isHighLinkDensity(Element $node, $limit = 1.0)
{
    $links = $node->find('a, [onclick]');
    $numberOfLinks = $links->count();

    if ($numberOfLinks == 0) {
        return false;
    }

    // preg_split() returns false on failure (e.g. malformed UTF-8 with the
    // "u" modifier); treat that as "no words" rather than passing false to
    // count().
    $countWords = static function ($text) {
        $tokens = preg_split('@[\\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? 0 : count($tokens);
    };

    $numberOfWords = $countWords($node->text());

    if ($numberOfWords == 0) {
        return false;
    }

    $linkTexts = [];
    foreach ($links as $link) {
        $linkTexts[] = Helper::textNormalise($link->text());
    }

    // Join with a space so the last word of one link and the first word of
    // the next remain two separate words.
    $numberOfLinkWords = $countWords(implode(' ', $linkTexts));

    $score = ($numberOfLinkWords / $numberOfWords) * $numberOfLinks;

    return $score >= $limit;
}
/**
 * Collects the usable links found under the parent of the article's top node.
 *
 * Every `a[href]` element is considered; those whose href is exactly '#' or
 * is empty after trimming are skipped. The href is returned as found in the
 * attribute (untrimmed); the link text is passed through
 * Helper::textNormalise().
 *
 * @return array[] List of ['url' => string, 'text' => string] entries
 */
private function getLinks()
{
    $anchors = $this->article()->getTopNode()->parent()->find('a[href]');

    $links = [];

    foreach ($anchors as $anchor) {
        $href = $anchor->attr('href');

        // Skip placeholder and blank targets.
        if ($href == '#' || trim($href) == '') {
            continue;
        }

        $links[] = [
            'url' => $href,
            'text' => Helper::textNormalise($anchor->text()),
        ];
    }

    return $links;
}
/**
 * Reports whether the top node is left without any <p> descendants and is
 * not itself a <td>.
 *
 * removeSmallParagraphs() is run on the node first, so the paragraph check
 * sees the node as that method leaves it.
 *
 * Exception: a <ul>/<ol> node that has some text, of which less than half
 * (by character count) belongs to <a> descendants, is not reported.
 *
 * @param Element $topNode
 *
 * @return bool True when no <p> is found, the node is not a <td>, and the
 *              list exception above does not apply; false otherwise
 */
private function isTableTagAndNoParagraphsExist(Element $topNode)
{
    $this->removeSmallParagraphs($topNode);

    $paragraphs = $topNode->find('p');

    if ($paragraphs->count() != 0 || !$topNode->is(':not(td)')) {
        return false;
    }

    if (!$topNode->is('ul, ol')) {
        return true;
    }

    // Total length of the text held by links inside the list.
    $anchorChars = 0;
    foreach ($topNode->find('a')->toArray() as $anchor) {
        $anchorChars += mb_strlen(Helper::textNormalise($anchor->text()));
    }

    $totalChars = mb_strlen(Helper::textNormalise($topNode->text()));

    // A list whose text is mostly outside links is kept (not reported).
    $mostlyPlainText = $totalChars > 0 && $anchorChars / $totalChars < 0.5;

    return !$mostlyPlainText;
}
/**
 * Get article title
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Sources are tried in this order, and the first one present wins:
 *  1. the 'title' entry of the article's OpenGraph data,
 *  2. the content attribute of the first node returned by
 *     getNodesByLowercasePropertyValue() for tag 'meta', attribute 'name',
 *     value 'headline',
 *  3. the text of the first `html > head > title` element, passed through
 *     Helper::textNormalise().
 * The chosen value is passed through cleanTitle().
 *
 * @return string The cleaned title, or '' when none of the sources exists
 */
private function getTitle()
{
    $og = $this->article()->getOpenGraph();

    // OpenGraph data takes precedence whenever it carries a title.
    if (isset($og['title'])) {
        return $this->cleanTitle($og['title']);
    }

    $headlineMeta = $this->getNodesByLowercasePropertyValue(
        $this->article()->getDoc(),
        'meta',
        'name',
        'headline'
    );

    if ($headlineMeta->count()) {
        $headline = $headlineMeta->first()->attr('content');

        return $this->cleanTitle($headline);
    }

    $titleTags = $this->article()->getDoc()->find('html > head > title');

    if (!$titleTags->count()) {
        return '';
    }

    $rawTitle = $titleTags->first()->text();

    return $this->cleanTitle(Helper::textNormalise($rawTitle));
}