/**
 * Write the result's data to a file named after the result's hash.
 *
 * The file is placed in a subdirectory of the configured directory whose name
 * is the first character of the hash, i.e. <dir>/<h>/<hash>.
 *
 * Note: Overwrites old files and uses existing subdirectories.
 *
 * @param Result $result Result providing the hash and the data to store.
 *
 * @return Result The same result instance, unmodified.
 */
public function execute(Result $result)
{
    $hash = $result->getHash();

    // <dir>/<first character of hash>/<hash>
    $target = implode(
        DIRECTORY_SEPARATOR,
        array($this->dir, substr($hash, 0, 1), $hash)
    );

    $filesystem = new Filesystem();
    $filesystem->dumpFile($target, $result->getData());

    return $result;
}
/**
 * Print the text of every node that matches the configured XPath expression.
 *
 * The result's data is loaded into a DOMCrawler and filtered with $this->xpath.
 * For each matching node the trimmed text is echoed followed by PHP_EOL; nodes
 * whose trimmed text is the empty string are skipped. The result itself is not
 * modified.
 *
 * @param Result $result Result whose data is searched.
 *
 * @return Result The same result instance, unmodified.
 */
public function execute(Result $result)
{
    $domCrawler = new DOMCrawler();
    $domCrawler->addContent($result->getData());

    $domCrawler->filterXPath($this->xpath)->each(function (DOMCrawler $node) {
        $text = trim($node->text());

        // Compare against '' explicitly: empty() also treats the string "0"
        // as empty, which would silently drop nodes whose text is "0".
        if ($text !== '') {
            echo $text . PHP_EOL;
        }
    });

    return $result;
}
/**
 * Save the plain data into a file whose path mirrors the result's link.
 *
 * The protocol and a leading "www." / "wwwN." prefix are stripped from the
 * link; the remainder is used as a path relative to the configured directory.
 * An underscore is appended when isPrettyUrl() reports true for the stripped link.
 *
 * Note: ".." path segments are removed from the link-derived path so that the
 * file cannot be written outside the configured directory.
 *
 * @param Result $result Result providing the link and the data to store.
 *
 * @return Result The same result instance, unmodified.
 */
public function execute(Result $result)
{
    // strip protocol and www.
    $url = preg_replace('/^((https?):\\/\\/)?(www\\d{0,3}\\.)?/', '', $result->getLink());

    // URLs like blog.com/posts and blog.com/posts/a would conflict, because
    // the first URL would be created as a file and the same name cannot be used for
    // a directory with the same name. Workaround is to simply attach an underscore
    // to the *index* file.
    if ($this->isPrettyUrl($url)) {
        $url = $url . '_';
    }

    // The link is not validated here, so drop any ".." segment (delimited by
    // "/" or "\") to keep the target inside $this->dir. The look-ahead leaves
    // the trailing separator unconsumed so consecutive "../.." segments are
    // all removed.
    $url = preg_replace('#(^|[/\\\\])\.\.(?=[/\\\\]|$)#', '$1', $url);

    $saveTo = $this->dir . DIRECTORY_SEPARATOR . $url;

    $fs = new Filesystem();
    $fs->dumpFile($saveTo, $result->getData());

    return $result;
}
/**
 * Replace the result's data with a minified version of itself.
 *
 * The data is passed through HTMLMinify::minify() with the optimization level
 * set to HTMLMinify::OPTIMIZATION_ADVANCED and written back onto the result.
 *
 * @param Result $result Result whose data is minified.
 *
 * @return Result The same result instance, now holding the minified data.
 */
public function execute(Result $result)
{
    $options = array('optimizationLevel' => HTMLMinify::OPTIMIZATION_ADVANCED);

    $minified = HTMLMinify::minify($result->getData(), $options);
    $result->setData($minified);

    return $result;
}