/**
 * Extract the text content of the page body.
 *
 * Before the text is read, every <img> element in the document is replaced
 * in place by a <span> whose text is "img" followed by the concatenated
 * tokens that parent::extract() yields for the image's "src" attribute, so
 * image sources contribute to the returned text.
 *
 * @param \DOMDocument $document Document to read; its <img> elements are replaced.
 * @return string Text content of the <body> element.
 * @throws \RuntimeException When the document has no <body> element.
 */
protected function parseBody(\DOMDocument $document)
{
    // getElementsByTagName() returns a *live* list: replacing an <img> while
    // iterating it shifts the remaining entries and causes images to be
    // skipped. Take a static snapshot before mutating the tree.
    /** @var \DOMElement[] $images */
    $images = iterator_to_array($document->getElementsByTagName('img'), false);

    foreach ($images as $image) {
        $src = 'img' . implode('', parent::extract($image->getAttribute('src')));

        // The value argument of createElement() is not escaped (an "&" would
        // raise a warning and lose text), so attach the text as a text node.
        $replacement = $document->createElement('span');
        $replacement->appendChild($document->createTextNode($src));

        $image->parentNode->replaceChild($replacement, $image);
    }

    // Extract raw text
    /** @var \DOMElement|null $node */
    $node = $document->getElementsByTagName('body')->item(0);
    if (!$node) {
        throw new \RuntimeException('Document has no <body> element.');
    }

    return $node->nodeValue;
}
/**
 * Checks that SimpleTextExtractor::extract() turns two short sentences into
 * the expected list of lower-cased words, with the punctuation dropped.
 */
public function testExtract()
{
    $subject = new SimpleTextExtractor();

    $expectedTokens = [
        'mary',
        'is',
        'very',
        'tall',
        'she',
        'was',
        'in',
        'the',
        '9th',
        'grade',
    ];
    $actualTokens = $subject->extract('Mary is very tall. She was in the 9th grade.');

    static::assertEquals($expectedTokens, $actualTokens);
}