예제 #1
1
 /**
  * Decides whether a node consists mostly of link text.
  *
  * The score is (words found inside link-like elements / words in the whole
  * node) multiplied by the number of link-like elements. Link-like elements
  * are anchors and anything carrying an onclick attribute.
  *
  * @param Element $node  Node to inspect.
  * @param double $limit  Score at or above which the node is considered link-heavy.
  *
  * @return bool True when the score reaches $limit; false when the node has no
  *              link-like elements, no words, or a score below $limit.
  */
 private function isHighLinkDensity(Element $node, $limit = 1.0)
 {
     $links = $node->find('a, [onclick]');
     $numberOfLinks = $links->count();
     if ($numberOfLinks == 0) {
         return false;
     }

     $wordPattern = '@[\\s]+@iu';

     $words = preg_split($wordPattern, $node->text(), -1, PREG_SPLIT_NO_EMPTY);
     // preg_split() yields false on failure (for instance malformed UTF-8 with
     // the /u modifier); treat that the same as "no words".
     if ($words === false || count($words) === 0) {
         return false;
     }

     $linkTexts = [];
     foreach ($links as $link) {
         $linkTexts[] = Helper::textNormalise($link->text());
     }

     // Join with a space so the last word of one link is not glued to the first
     // word of the next, and tokenise exactly like $words above. NO_EMPTY makes
     // links without any text contribute zero words instead of one empty token.
     $linkWords = preg_split($wordPattern, implode(' ', $linkTexts), -1, PREG_SPLIT_NO_EMPTY);
     $numberOfLinkWords = $linkWords === false ? 0 : count($linkWords);

     $score = ($numberOfLinkWords / count($words)) * $numberOfLinks;

     return $score >= $limit;
 }
예제 #2
0
 /**
  * Builds an Article for a URL by running the cleaner, extractor and
  * formatter modules over its HTML.
  *
  * libxml internal error handling is switched on for the duration of the
  * crawl and the previous setting is restored afterwards, including when an
  * exception interrupts the crawl.
  *
  * @param string $url          Address of the page; it is passed through Helper::getCleanedUrl() first.
  * @param string|null $rawHTML Markup to parse; when empty the HTML is fetched via getHTML().
  *
  * @return Article
  */
 public function crawl($url, $rawHTML = null)
 {
     $article = new Article();
     $parseCandidate = Helper::getCleanedUrl($url);

     // Remember the caller's libxml setting so it can be put back no matter
     // how this method exits.
     $xmlInternalErrors = libxml_use_internal_errors(true);

     try {
         if (empty($rawHTML)) {
             $rawHTML = $this->getHTML($parseCandidate->url);
         }

         // Generate document
         $doc = $this->getDocument($rawHTML);

         // Set core mutators
         $article->setFinalUrl($parseCandidate->url);
         $article->setDomain($parseCandidate->parts->host);
         $article->setLinkhash($parseCandidate->linkhash);
         $article->setRawHtml($rawHTML);
         $article->setDoc($doc);
         // Keep an untouched copy; the modules below work on $doc.
         $article->setRawDoc(clone $doc);

         // Pre-extraction document cleaning
         $this->modules('cleaners', $article);

         // Extract content
         $this->modules('extractors', $article);

         // Post-extraction content formatting
         $this->modules('formatters', $article);

         return $article;
     } finally {
         libxml_use_internal_errors($xmlInternalErrors);
     }
 }
예제 #3
0
 /**
  * Collects the usable hyperlinks found under the parent of the article's top node.
  *
  * Anchors whose href is exactly "#" or is blank after trimming are skipped.
  *
  * @return array[] One entry per kept anchor: 'url' holds the href attribute
  *                 value as-is, 'text' the normalised anchor text.
  */
 private function getLinks()
 {
     $anchors = $this->article()->getTopNode()->parent()->find('a[href]');

     $collected = [];
     foreach ($anchors as $anchor) {
         $href = $anchor->attr('href');

         // Placeholder and empty targets carry no useful destination.
         if ($href == '#' || trim($href) == '') {
             continue;
         }

         $collected[] = [
             'url' => $href,
             'text' => Helper::textNormalise($anchor->text()),
         ];
     }

     return $collected;
 }
예제 #4
0
 /**
  * Reports whether the node, after its small paragraphs have been removed,
  * has no <p> descendants and is not a <td>.
  *
  * Lists (<ul>/<ol>) get one exception: when they contain text and less than
  * half of its length comes from anchors, the method answers false.
  *
  * Note: removeSmallParagraphs() is invoked on the node first, so calling
  * this method modifies $topNode.
  *
  * @param Element $topNode Node to examine.
  *
  * @return bool
  */
 private function isTableTagAndNoParagraphsExist(Element $topNode)
 {
     $this->removeSmallParagraphs($topNode);

     if ($topNode->find('p')->count() != 0) {
         return false;
     }

     if (!$topNode->is(':not(td)')) {
         return false;
     }

     if (!$topNode->is('ul, ol')) {
         return true;
     }

     // Total length of the normalised text of every anchor inside the list.
     $linkTextLength = 0;
     foreach ($topNode->find('a')->toArray() as $anchor) {
         $linkTextLength += mb_strlen(Helper::textNormalise($anchor->text()));
     }

     $elementTextLength = mb_strlen(Helper::textNormalise($topNode->text()));

     $mostlyPlainText = $elementTextLength > 0
         && $linkTextLength / $elementTextLength < 0.5;

     return !$mostlyPlainText;
 }
예제 #5
0
 /**
  * Helper::getCleanedUrl() is expected to reject a URL whose port is not numeric.
  *
  * The expectation is declared through the annotation below, so that line
  * must remain in this docblock.
  *
  * @expectedException Goose\Exceptions\MalformedURLException
  */
 public function testGetCleanedUrlException()
 {
     $urlWithBadPort = 'http://example.org:port/';

     Helper::getCleanedUrl($urlWithBadPort);
 }
예제 #6
0
 /**
  * Determines the article title.
  *
  * Sources are tried in this order and the first hit wins:
  *   1. the 'title' entry of the article's OpenGraph data,
  *   2. the content attribute of a <meta name="headline"> node,
  *   3. the normalised text of the <title> element in <head>.
  * Every hit is passed through cleanTitle() before being returned.
  *
  * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
  *
  * @return string The cleaned title, or an empty string when no source matched.
  */
 private function getTitle()
 {
     $og = $this->article()->getOpenGraph();
     if (isset($og['title'])) {
         return $this->cleanTitle($og['title']);
     }

     $headlineMeta = $this->getNodesByLowercasePropertyValue(
         $this->article()->getDoc(),
         'meta',
         'name',
         'headline'
     );
     if ($headlineMeta->count()) {
         $headline = $headlineMeta->first()->attr('content');

         return $this->cleanTitle($headline);
     }

     $titleTags = $this->article()->getDoc()->find('html > head > title');
     if (!$titleTags->count()) {
         return '';
     }

     $rawTitle = $titleTags->first()->text();

     return $this->cleanTitle(Helper::textNormalise($rawTitle));
 }