/**
 * Promotes blocks located between the title block and the first content
 * block to content when they carry the MIGHT_BE_CONTENT label.
 *
 * The title is the last block labelled TITLE that appears no later than the
 * first content block. Nothing happens when no title is found, or when the
 * first content block does not come after it.
 *
 * @param TextDocument $doc Document whose text blocks are inspected and updated.
 *
 * @return bool True when at least one setIsContent() call reported a change.
 */
public function process(TextDocument $doc)
 {
     // Re-index once so that the positions recorded by the scan below address
     // the very same blocks in the second pass, even if the keys have gaps.
     $blocks = array_values($doc->getTextBlocks());
     $title = -1;
     $contentStart = -1;
     foreach ($blocks as $position => $tb) {
         if ($tb->hasLabel(TextLabels::TITLE)) {
             $title = $position;
         }
         if ($tb->isContent()) {
             $contentStart = $position;
             // Blocks after the first content block cannot affect the result.
             break;
         }
     }
     if ($title == -1 || $contentStart <= $title) {
         return false;
     }
     $changes = false;
     // Both ends of the range are included.
     for ($position = $title; $position <= $contentStart; $position++) {
         $tb = $blocks[$position];
         if ($tb->hasLabel(TextLabels::MIGHT_BE_CONTENT)) {
             $changes = $tb->setIsContent(true) || $changes;
         }
     }
     return $changes;
 }
 /**
  * Removes every text block that is not content from the document, except
  * blocks carrying the configured label to keep (when one is set).
  *
  * @param TextDocument $doc Document to prune.
  *
  * @return bool True when at least one block was removed.
  */
 public function process(TextDocument $doc)
 {
     $removedAny = false;
     foreach ($doc->getTextBlocks() as $block) {
         if ($block->isContent()) {
             continue;
         }
         // A configured label protects matching blocks from removal.
         if ($this->labelToKeep != null && $block->hasLabel($this->labelToKeep)) {
             continue;
         }
         $doc->removeTextBlock($block);
         $removedAny = true;
     }
     return $removedAny;
 }
 /**
  * Fuses content blocks into the preceding anchor block when the gap between
  * them does not exceed the configured maximum distance.
  *
  * With contentOnly enabled the first anchor is the first content block;
  * otherwise it is the first block of the document. Non-content blocks are
  * skipped and never become the anchor inside the main loop. A merge can be
  * vetoed by the contentOnly and sameTagLevelOnly settings, in which case the
  * current block becomes the new anchor.
  *
  * @param TextDocument $doc Document whose blocks are merged in place.
  *
  * @return bool True when at least one block was merged and removed.
  */
 public function process(TextDocument $doc)
 {
     $blocks = $doc->getTextBlocks();
     $total = count($blocks);
     if ($total < 2) {
         return false;
     }

     $anchor = null;
     $start = 0;
     if (!$this->contentOnly) {
         $anchor = $blocks[0];
         $start = 1;
     } else {
         // Locate the first content block; $start ends up one past it, or at
         // $total when there is none (so the loop below is skipped).
         foreach ($blocks as $candidate) {
             $start++;
             if ($candidate->isContent()) {
                 $anchor = $candidate;
                 break;
             }
         }
     }

     $merged = false;
     for ($index = $start; $index < $total; $index++) {
         $current = $blocks[$index];
         if (!$current->isContent()) {
             continue;
         }

         $gap = $current->getStartOffset() - $anchor->getEndOffset() - 1;
         $fusable = $gap <= $this->maxBlocksDistance;
         if ($fusable && $this->contentOnly) {
             $fusable = $anchor->isContent() && $current->isContent();
         }
         if ($fusable && $this->sameTagLevelOnly) {
             $fusable = $anchor->getLevel() == $current->getLevel();
         }

         if (!$fusable) {
             $anchor = $current;
             continue;
         }

         $anchor->mergeNext($current);
         $doc->removeTextBlock($current);
         $merged = true;
     }
     return $merged;
 }
Пример #4
0
 /**
  * Marks list-item blocks as content when they follow a VERY_LIKELY_CONTENT
  * content block at a deeper level.
  *
  * A block qualifies when its level is greater than the remembered level, it
  * is labelled both MIGHT_BE_CONTENT and LI, and its link density is zero.
  * Any other block resets the remembered level, so nothing qualifies until
  * the next VERY_LIKELY_CONTENT content block.
  *
  * @param TextDocument $doc Document whose blocks are updated in place.
  *
  * @return bool True when at least one block was marked as content.
  */
 public function process(TextDocument $doc)
 {
     $modified = false;
     $anchorLevel = PHP_INT_MAX;
     foreach ($doc->getTextBlocks() as $block) {
         if ($block->isContent() && $block->hasLabel(TextLabels::VERY_LIKELY_CONTENT)) {
             $anchorLevel = $block->getLevel();
             continue;
         }

         $isFollowingListItem = $block->getLevel() > $anchorLevel
             && $block->hasLabel(TextLabels::MIGHT_BE_CONTENT)
             && $block->hasLabel(TextLabels::LI)
             && $block->getLinkDensity() == 0;

         if (!$isFollowingListItem) {
             // Chain broken: no level can exceed PHP_INT_MAX.
             $anchorLevel = PHP_INT_MAX;
             continue;
         }

         $block->setIsContent(true);
         $modified = true;
     }
     return $modified;
 }
 /**
  * Marks every block from the end-of-text marker onwards as non-content.
  *
  * The marker is the first block labelled INDICATES_END_OF_TEXT that is
  * reached once the running word count of content blocks is at least
  * minWordCount. The marker block itself is included.
  *
  * @param TextDocument $doc Document whose blocks are updated in place.
  *
  * @return bool True when the end-of-text marker was reached.
  */
 public function process(TextDocument $doc)
 {
     $modified = false;
     $contentWords = 0;
     $pastEnd = false;
     foreach ($doc->getTextBlocks() as $block) {
         $marksEnd = $block->hasLabel(TextLabels::INDICATES_END_OF_TEXT);
         if ($block->isContent()) {
             $contentWords += $this->getFullTextWordCount($block);
         }
         // Once the end has been reached it stays reached.
         $pastEnd = $pastEnd || ($marksEnd && $contentWords >= $this->minWordCount);
         if (!$pastEnd) {
             continue;
         }
         $modified = true;
         $block->setIsContent(false);
     }
     return $modified;
 }
 /**
  * Demotes trailing heading blocks to non-content.
  *
  * Walks the blocks from last to first, skipping non-content blocks. Content
  * blocks labelled HEADING are marked as non-content. The walk stops at the
  * first content block that is not a heading.
  *
  * @param TextDocument $doc Document whose blocks are updated in place.
  *
  * @return bool True when at least one heading was demoted.
  */
 public function process(TextDocument $doc)
 {
     $demoted = false;
     foreach (array_reverse($doc->getTextBlocks()) as $block) {
         if (!$block->isContent()) {
             continue;
         }
         if (!$block->hasLabel(TextLabels::HEADING)) {
             break;
         }
         $block->setIsContent(false);
         $demoted = true;
     }
     return $demoted;
 }
 /**
  * Classifies every text block of the document using its immediate
  * neighbours as context.
  *
  * Each real block is passed to classify() as the current block together with
  * the block before and after it. At the edges of the document a fresh, empty
  * TextBlock stands in for the missing neighbour. Placeholders are never
  * classified themselves, so the return value only reflects calls made for
  * real blocks of the document.
  *
  * @param TextDocument $doc Document whose blocks are classified.
  *
  * @return bool True when classify() reported a change for at least one block.
  */
 public function process(TextDocument $doc)
 {
     // Re-index so neighbours can be addressed by position.
     $blocks = array_values($doc->getTextBlocks());
     $total = count($blocks);
     $change = false;
     for ($i = 0; $i < $total; $i++) {
         $prev = $i > 0 ? $blocks[$i - 1] : new TextBlock();
         $next = $i + 1 < $total ? $blocks[$i + 1] : new TextBlock();
         $change = $this->classify($prev, $blocks[$i], $next) || $change;
     }
     return $change;
 }
 /**
  * Marks large non-content blocks as content when they sit at the same level
  * as a reference content block.
  *
  * The reference level is taken from the first content block labelled
  * MIGHT_BE_CONTENT. Every non-content block with at least 100 words at that
  * level is then marked as content.
  *
  * @param TextDocument $doc Document whose blocks are updated in place.
  *
  * @return bool True when at least one block was marked as content; false
  *              also when no reference block exists.
  */
 public function process(TextDocument $doc)
 {
     $referenceLevel = -1;
     foreach ($doc->getTextBlocks() as $candidate) {
         if (!$candidate->isContent()) {
             continue;
         }
         if (!$candidate->hasLabel(TextLabels::MIGHT_BE_CONTENT)) {
             continue;
         }
         $referenceLevel = $candidate->getLevel();
         break;
     }
     if ($referenceLevel == -1) {
         return false;
     }

     $promoted = false;
     foreach ($doc->getTextBlocks() as $block) {
         if ($block->isContent()) {
             continue;
         }
         if ($block->getWordCount() < 100) {
             continue;
         }
         if ($block->getLevel() != $referenceLevel) {
             continue;
         }
         $block->setIsContent(true);
         $promoted = true;
     }
     return $promoted;
 }
 /**
  * Labels short blocks whose text looks like the start of a comment or
  * feedback section with INDICATES_END_OF_TEXT.
  *
  * Only blocks with fewer than 15 words are examined. Text longer than seven
  * characters is matched against a fixed list of phrases. Shorter text
  * qualifies only when the block has a link density of 1 and reads "comment".
  *
  * The text is lower-cased with mb_strtolower() so that non-ASCII letters
  * (for example in "rätta artikeln") are folded as well, consistent with
  * the mb_strlen() length check.
  *
  * @param TextDocument $doc Document whose blocks are labelled in place.
  *
  * @return bool True when at least one block received the label.
  */
 public function process(TextDocument $doc)
 {
     $change = false;
     foreach ($doc->getTextBlocks() as $tb) {
         $result = false;
         $wordCount = $tb->getWordCount();
         if ($wordCount < 15) {
             $text = mb_strtolower(trim($tb->getText()));
             $length = mb_strlen($text);
             if ($length > 7) {
                 $result = "thanks for your comments - this feedback is now closed" === $text
                     || $this->startWith($text, "comments")
                     || $this->startWith($text, "© reuters")
                     || $this->startWith($text, "please rate this")
                     || $this->startWith($text, "post a comment")
                     || $this->startWithNumber($text, [" comments", " users responded in"])
                     || $this->contains($text, "what you think...")
                     || $this->contains($text, "add your comment")
                     || $this->contains($text, "add comment")
                     || $this->contains($text, "reader views")
                     || $this->contains($text, "have your say")
                     || $this->contains($text, "reader comments")
                     || $this->contains($text, "rätta artikeln");
             } elseif (1 == $tb->getLinkDensity()) {
                 // Very short text only counts when the whole block is a link.
                 $result = "comment" == $text;
             }
         }
         if ($result) {
             $tb->addLabel(TextLabels::INDICATES_END_OF_TEXT);
         }
         $change = $change || $result;
     }
     return $change;
 }
 /**
  * Labels the first text block whose text matches one of the potential
  * titles derived from the document title.
  *
  * Each block's cleaned text is looked up as-is first. If that fails, it is
  * looked up again with runs of the characters ? ! . - : removed and the
  * result trimmed. Processing stops at the first match.
  *
  * @param TextDocument $doc Document whose blocks are labelled in place.
  *
  * @return bool True when a block received the TITLE label; false also when
  *              no potential titles are available.
  */
 public function process(TextDocument $doc)
 {
     $candidates = $this->getPotentialTitles($this->clear($doc->getTitle()));
     if (!$candidates) {
         return false;
     }

     foreach ($doc->getTextBlocks() as $block) {
         $needle = $this->clear($block->getText());
         if (!isset($candidates[$needle])) {
             // Second attempt without punctuation.
             $needle = trim(preg_replace('/[\\?\\!\\.\\-\\:]+/', '', $needle));
             if (!isset($candidates[$needle])) {
                 continue;
             }
         }
         $block->addLabel(TextLabels::TITLE);
         return true;
     }
     return false;
 }
 /**
  * Keeps only the block with the most words as content and optionally
  * expands the selection to neighbouring blocks at the same level.
  *
  * The largest block (the first one, on ties) is marked as content and
  * labelled VERY_LIKELY_CONTENT. All other blocks are marked as non-content
  * and labelled MIGHT_BE_CONTENT. When expandToSameLevelText is enabled, the
  * blocks before and after the largest one are walked until a block with a
  * lower level is met. Blocks at the same level with at least minWords words
  * are marked as content again.
  *
  * The largest block is recognised by identity (===), so a different block
  * that merely has equal properties is not treated as the largest one.
  *
  * @param TextDocument $doc Document whose blocks are updated in place.
  *
  * @return bool False when the document has fewer than two blocks, true otherwise.
  */
 public function process(TextDocument $doc)
 {
     // Re-index once so that the recorded position of the largest block ($n)
     // is a valid offset into $blocks for the expansion loops below.
     $blocks = array_values($doc->getTextBlocks());
     $total = count($blocks);
     if ($total < 2) {
         return false;
     }
     $maxNumWords = -1;
     $largestBlock = null;
     $level = -1;
     $n = -1;
     foreach ($blocks as $i => $tb) {
         $wc = $tb->getWordCount();
         if ($wc > $maxNumWords) {
             $largestBlock = $tb;
             $maxNumWords = $wc;
             $n = $i;
             if ($this->expandToSameLevelText) {
                 $level = $tb->getLevel();
             }
         }
     }
     foreach ($blocks as $tb) {
         if ($tb === $largestBlock) {
             $tb->setIsContent(true);
             $tb->addLabel(TextLabels::VERY_LIKELY_CONTENT);
         } else {
             $tb->setIsContent(false);
             $tb->addLabel(TextLabels::MIGHT_BE_CONTENT);
         }
     }
     if ($this->expandToSameLevelText && $n != -1) {
         // Walk backwards from the largest block.
         for ($i = $n; $i >= 0; $i--) {
             $tb = $blocks[$i];
             $tl = $tb->getLevel();
             if ($tl < $level) {
                 break;
             }
             if ($tl == $level && $tb->getWordCount() >= $this->minWords) {
                 $tb->setIsContent(true);
             }
         }
         // Walk forwards from the largest block.
         for ($i = $n; $i < $total; $i++) {
             $tb = $blocks[$i];
             $tl = $tb->getLevel();
             if ($tl < $level) {
                 break;
             }
             if ($tl == $level && $tb->getWordCount() >= $this->minWords) {
                 $tb->setIsContent(true);
             }
         }
     }
     return true;
 }