/**
 * Flags blocks between the title block and the first content block as content.
 *
 * The first pass records the position of the last TITLE-labelled block seen
 * before any content block, and the position of the first content block.
 * The second pass visits every block whose key lies in that inclusive range
 * and, when it carries the MIGHT_BE_CONTENT label, calls setIsContent(true).
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool false when no title was found or the first content block does
 *              not come after it; otherwise whether any setIsContent(true)
 *              call returned a truthy value
 */
public function process(TextDocument $doc)
{
    $titleIndex = -1;
    $firstContentIndex = -1;
    $position = 0;

    foreach ($doc->getTextBlocks() as $block) {
        // Nothing is updated any more once the first content block is known.
        if ($firstContentIndex == -1) {
            if ($block->hasLabel(TextLabels::TITLE)) {
                $titleIndex = $position;
            }
            if ($block->isContent()) {
                $firstContentIndex = $position;
            }
        }
        $position++;
    }

    if ($titleIndex == -1 || $firstContentIndex <= $titleIndex) {
        return false;
    }

    $changed = false;
    foreach ($doc->getTextBlocks() as $key => $block) {
        $inRange = $key >= $titleIndex && $key <= $firstContentIndex;
        if ($inRange && $block->hasLabel(TextLabels::MIGHT_BE_CONTENT)) {
            $changed = $block->setIsContent(true) || $changed;
        }
    }

    return $changed;
}
/**
 * Removes every block that is not content from the document.
 *
 * A non-content block is kept when $this->labelToKeep is set and the block
 * carries that label.
 *
 * @param TextDocument $doc document to prune
 *
 * @return bool true when at least one block was removed
 */
public function process(TextDocument $doc)
{
    $removedAny = false;

    foreach ($doc->getTextBlocks() as $block) {
        if ($block->isContent()) {
            continue;
        }
        if ($this->labelToKeep != null && $block->hasLabel($this->labelToKeep)) {
            continue;
        }

        $doc->removeTextBlock($block);
        $removedAny = true;
    }

    return $removedAny;
}
/**
 * Merges content blocks that lie close to the preceding anchor block.
 *
 * The anchor starts as the first content block (when $this->contentOnly is
 * set) or as the first block of the document. Each later content block is
 * compared with the anchor: when the offset gap between them is at most
 * $this->maxBlocksDistance, and the optional content-only and same-level
 * constraints hold, the block is merged into the anchor and removed from the
 * document. Otherwise that block becomes the new anchor. Non-content blocks
 * are skipped and never become the anchor inside the main loop.
 *
 * @param TextDocument $doc document whose blocks are fused
 *
 * @return bool true when at least one merge happened; false for documents
 *              with fewer than two blocks
 */
public function process(TextDocument $doc)
{
    $blocks = $doc->getTextBlocks();
    $total = count($blocks);
    if ($total < 2) {
        return false;
    }

    if ($this->contentOnly) {
        // Find the first content block; $start ends up one past its position.
        $anchor = null;
        $start = 0;
        foreach ($blocks as $candidate) {
            $start++;
            if ($candidate->isContent()) {
                $anchor = $candidate;
                break;
            }
        }
    } else {
        $anchor = $blocks[0];
        $start = 1;
    }

    $merged = false;
    for ($index = $start; $index < $total; $index++) {
        $current = $blocks[$index];
        if (!$current->isContent()) {
            continue;
        }

        $gap = $current->getStartOffset() - $anchor->getEndOffset() - 1;

        $fusable = $gap <= $this->maxBlocksDistance;
        if ($fusable && $this->contentOnly) {
            $fusable = $anchor->isContent() && $current->isContent();
        }
        if ($fusable && $this->sameTagLevelOnly) {
            $fusable = $anchor->getLevel() == $current->getLevel();
        }

        if (!$fusable) {
            $anchor = $current;
            continue;
        }

        $anchor->mergeNext($current);
        $doc->removeTextBlock($current);
        $merged = true;
    }

    return $merged;
}
/**
 * Flags large non-content blocks on the same level as the main content.
 *
 * The reference level is taken from the first block that is content and is
 * labelled VERY_LIKELY_CONTENT. Every non-content block on that level with at
 * least 100 words is then flagged as content.
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when at least one block was flagged as content; false
 *              when no reference block exists or nothing qualified
 */
public function process(TextDocument $doc)
{
    $minWords = 100;

    // Locate the main-content block. It is identified by VERY_LIKELY_CONTENT;
    // MIGHT_BE_CONTENT marks candidate blocks and must not be used here.
    $level = -1;
    foreach ($doc->getTextBlocks() as $tb) {
        if ($tb->isContent() && $tb->hasLabel(TextLabels::VERY_LIKELY_CONTENT)) {
            $level = $tb->getLevel();
            break;
        }
    }

    if ($level == -1) {
        return false;
    }

    $changes = false;
    foreach ($doc->getTextBlocks() as $tb) {
        if ($tb->isContent()) {
            continue;
        }
        if ($tb->getWordCount() >= $minWords && $tb->getLevel() == $level) {
            $tb->setIsContent(true);
            $changes = true;
        }
    }

    return $changes;
}
/**
 * Flags list items that directly follow very-likely-content as content.
 *
 * A content block labelled VERY_LIKELY_CONTENT sets the reference level.
 * Each following block that is deeper than that level, is labelled both
 * MIGHT_BE_CONTENT and LI, and has a link density of 0 is flagged as content.
 * Any other block resets the reference level so that later blocks no longer
 * qualify until a new reference is seen.
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when at least one block was flagged as content
 */
public function process(TextDocument $doc)
{
    $changed = false;
    $referenceLevel = PHP_INT_MAX;

    foreach ($doc->getTextBlocks() as $block) {
        if ($block->isContent() && $block->hasLabel(TextLabels::VERY_LIKELY_CONTENT)) {
            $referenceLevel = $block->getLevel();
            continue;
        }

        $isTrailingListItem = $block->getLevel() > $referenceLevel
            && $block->hasLabel(TextLabels::MIGHT_BE_CONTENT)
            && $block->hasLabel(TextLabels::LI)
            && $block->getLinkDensity() == 0;

        if (!$isTrailingListItem) {
            // Chain broken: no level can exceed PHP_INT_MAX.
            $referenceLevel = PHP_INT_MAX;
            continue;
        }

        $block->setIsContent(true);
        $changed = true;
    }

    return $changed;
}
/**
 * Marks everything from the end-of-text marker onwards as non-content.
 *
 * Word counts of content blocks (via getFullTextWordCount()) are accumulated
 * in document order. The first block labelled INDICATES_END_OF_TEXT that is
 * reached once the running total is at least $this->minWordCount triggers the
 * cut: that block and all blocks after it get setIsContent(false).
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when the cut was triggered, even if the affected blocks
 *              were already non-content
 */
public function process(TextDocument $doc)
{
    $changed = false;
    $contentWords = 0;
    $pastEnd = false;

    foreach ($doc->getTextBlocks() as $block) {
        $marksEnd = $block->hasLabel(TextLabels::INDICATES_END_OF_TEXT);

        if ($block->isContent()) {
            $contentWords += $this->getFullTextWordCount($block);
        }

        if (!$pastEnd) {
            $pastEnd = $marksEnd && $contentWords >= $this->minWordCount;
        }

        if ($pastEnd) {
            $block->setIsContent(false);
            $changed = true;
        }
    }

    return $changed;
}
/**
 * Demotes headings that trail the content to non-content.
 *
 * Blocks are walked from the last to the first. Non-content blocks are
 * skipped; content blocks labelled HEADING are set to non-content; the walk
 * stops at the first content block that is not a heading.
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when at least one heading was demoted
 */
public function process(TextDocument $doc)
{
    $demoted = false;

    foreach (array_reverse($doc->getTextBlocks()) as $block) {
        if (!$block->isContent()) {
            continue;
        }
        if (!$block->hasLabel(TextLabels::HEADING)) {
            break;
        }

        $block->setIsContent(false);
        $demoted = true;
    }

    return $demoted;
}
/**
 * Runs classify() over every block together with its two neighbours.
 *
 * Each real block of the document is passed exactly once as the "current"
 * argument of $this->classify($prev, $curr, $next). A fresh, empty TextBlock
 * stands in for the missing neighbour before the first block and after the
 * last one. Placeholders are only ever passed as neighbours, never as the
 * block being classified, so they cannot influence the returned flag.
 *
 * @param TextDocument $doc document whose text blocks are classified
 *
 * @return bool true when classify() returned a truthy value for at least one
 *              block; false for a document without blocks
 */
public function process(TextDocument $doc)
{
    $change = false;
    $prev = new TextBlock();
    $curr = null;

    foreach ($doc->getTextBlocks() as $next) {
        // Classification lags one step behind so that $next is always known.
        if ($curr !== null) {
            $change = $this->classify($prev, $curr, $next) || $change;
            $prev = $curr;
        }
        $curr = $next;
    }

    // The last block has no successor; give it an empty placeholder instead.
    if ($curr !== null) {
        $change = $this->classify($prev, $curr, new TextBlock()) || $change;
    }

    return $change;
}
/**
 * Labels short blocks that typically mark the end of the article text.
 *
 * Only blocks with fewer than 15 words are examined. Their text is trimmed
 * and lower-cased with the multibyte-aware mb_strtolower(), so that
 * non-ASCII capitals are folded as well. Texts longer than 7 characters are
 * checked against a fixed list of phrases through the startWith(),
 * startWithNumber() and contains() helpers; shorter texts match only when
 * the block has a link density of 1 and the text is exactly "comment".
 * Matching blocks receive the INDICATES_END_OF_TEXT label.
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when at least one block matched
 */
public function process(TextDocument $doc)
{
    $change = false;

    foreach ($doc->getTextBlocks() as $tb) {
        if ($tb->getWordCount() >= 15) {
            continue;
        }

        // strtolower() works on bytes and leaves characters such as "Ä"
        // untouched, which would make the non-ASCII patterns below miss.
        $text = mb_strtolower(trim($tb->getText()));

        if (mb_strlen($text) > 7) {
            $result = "thanks for your comments - this feedback is now closed" === $text
                || $this->startWith($text, "comments")
                || $this->startWith($text, "© reuters")
                || $this->startWith($text, "please rate this")
                || $this->startWith($text, "post a comment")
                || $this->startWithNumber($text, [" comments", " users responded in"])
                || $this->contains($text, "what you think...")
                || $this->contains($text, "add your comment")
                || $this->contains($text, "add comment")
                || $this->contains($text, "reader views")
                || $this->contains($text, "have your say")
                || $this->contains($text, "reader comments")
                || $this->contains($text, "rätta artikeln");
        } else {
            // Loose comparison on purpose: the density may be int 1 or float 1.0.
            $result = 1 == $tb->getLinkDensity() && "comment" === $text;
        }

        if ($result) {
            $tb->addLabel(TextLabels::INDICATES_END_OF_TEXT);
            $change = true;
        }
    }

    return $change;
}
/**
 * Labels the first block whose text matches the document title.
 *
 * The title is normalised with clear() and expanded into a lookup table by
 * getPotentialTitles(). Each block's normalised text is looked up as-is and,
 * failing that, again after runs of ? ! . - : have been stripped and the
 * result trimmed. The first matching block receives the TITLE label and the
 * search stops.
 *
 * @param TextDocument $doc document whose text blocks are inspected
 *
 * @return bool true when a block was labelled; false when there are no
 *              potential titles or no block matched
 */
public function process(TextDocument $doc)
{
    $candidates = $this->getPotentialTitles($this->clear($doc->getTitle()));
    if (!$candidates) {
        return false;
    }

    foreach ($doc->getTextBlocks() as $block) {
        $normalized = $this->clear($block->getText());
        $matches = isset($candidates[$normalized]);

        if (!$matches) {
            $stripped = trim(preg_replace('/[\\?\\!\\.\\-\\:]+/', '', $normalized));
            $matches = isset($candidates[$stripped]);
        }

        if ($matches) {
            $block->addLabel(TextLabels::TITLE);

            return true;
        }
    }

    return false;
}
/**
 * Keeps only the block with the most words as content.
 *
 * The first block with the highest word count is flagged as content and
 * labelled VERY_LIKELY_CONTENT; every other block is set to non-content and
 * labelled MIGHT_BE_CONTENT. When $this->expandToSameLevelText is set, blocks
 * around the largest one are re-flagged as content if they sit on the same
 * level and have at least $this->minWords words. The expansion stops in each
 * direction at the first block on a shallower level.
 *
 * NOTE(review): every block competes for "largest" regardless of its current
 * content flag — confirm that this is intended.
 *
 * @param TextDocument $doc document whose text blocks are re-flagged
 *
 * @return bool false for documents with fewer than two blocks, true otherwise
 */
public function process(TextDocument $doc)
{
    $blocks = $doc->getTextBlocks();
    if (count($blocks) < 2) {
        return false;
    }

    $maxNumWords = -1;
    $largestBlock = null;
    $level = -1;
    $n = -1;
    $i = 0;
    foreach ($blocks as $tb) {
        $wc = $tb->getWordCount();
        // Strict ">" keeps the first block when several share the maximum.
        if ($wc > $maxNumWords) {
            $largestBlock = $tb;
            $maxNumWords = $wc;
            $n = $i;
            if ($this->expandToSameLevelText) {
                $level = $tb->getLevel();
            }
        }
        $i++;
    }

    foreach ($blocks as $tb) {
        // Identity check: "==" on objects compares property values, so a
        // different block with equal properties would also count as largest.
        if ($tb === $largestBlock) {
            $tb->setIsContent(true);
            $tb->addLabel(TextLabels::VERY_LIKELY_CONTENT);
        } else {
            $tb->setIsContent(false);
            $tb->addLabel(TextLabels::MIGHT_BE_CONTENT);
        }
    }

    if ($this->expandToSameLevelText && $n != -1) {
        // Walk backwards from the largest block.
        for ($i = $n; $i >= 0; $i--) {
            $tb = $blocks[$i];
            $tl = $tb->getLevel();
            if ($tl < $level) {
                break;
            }
            if ($tl == $level && $tb->getWordCount() >= $this->minWords) {
                $tb->setIsContent(true);
            }
        }

        // Walk forwards from the largest block.
        for ($i = $n, $l = count($blocks); $i < $l; $i++) {
            $tb = $blocks[$i];
            $tl = $tb->getLevel();
            if ($tl < $level) {
                break;
            }
            if ($tl == $level && $tb->getWordCount() >= $this->minWords) {
                $tb->setIsContent(true);
            }
        }
    }

    return true;
}