/**
 * Build a WatchLink describing the page at the given URL.
 *
 * The shared crawler is cleared, the page is fetched through the injected
 * fetcher and loaded into the crawler, and the name, description and image
 * are then taken from the extract*() helpers. Each entry of $tags is passed
 * to the tag repository's findOrCreate() and the result attached to the link.
 *
 * @param string $url  Address of the page to fetch.
 * @param array  $tags Values handed one by one to the tag repository.
 *
 * @return WatchLink
 */
public function extract(string $url, array $tags): WatchLink
{
    $link = new WatchLink();
    $link->setUrl($url);

    // Reset the crawler before fetching so no nodes from a previous page remain.
    $this->crawler->clear();
    $pageHtml = $this->fetcher->fetch($url);
    $this->crawler->addHtmlContent($pageHtml);

    $link->setName($this->extractTitle());
    $link->setDescription($this->extractDescription());
    $link->setImage($this->extractImage());

    foreach ($tags as $tagValue) {
        $tagEntity = $this->tagRepository->findOrCreate($tagValue);
        $link->addTag($tagEntity);
    }

    return $link;
}
/**
 * {@inheritDoc}
 *
 * Parses the given HTML and collects, for every `#overview-top` element,
 * an array with the keys `title`, `year`, `rating` and `desc`.
 *
 * The collected list is returned to the caller; previously it was built
 * and then silently discarded.
 *
 * NOTE(review): the text() calls are expected to throw when a selector
 * matches nothing (Symfony DomCrawler behaviour) — confirm that callers
 * are prepared for that.
 *
 * @param string $html HTML markup to parse.
 *
 * @return array One entry per `#overview-top` element.
 */
public function crawl($html)
{
    // A freshly constructed crawler is already empty, so no clear() is needed.
    $crawler = new Crawler();
    $crawler->addHtmlContent($html);

    return $crawler->filter('#overview-top')->each(function (Crawler $overview) {
        // The first header span holds the title, the last one the year.
        $headerSpans = $overview->filter('.header span');

        return [
            'title' => $headerSpans->first()->text(),
            'year' => $headerSpans->last()->text(),
            'rating' => $overview->filter('.star-box .giga-star ')->first()->text(),
            'desc' => $overview->filter('p.description')->text(),
        ];
    });
}
/**
 * Load an HTML document and collect its named meta tags and its links.
 *
 * The shared crawler is cleared and refilled with $html. Every <meta>
 * element that has a non-empty `name` attribute is stored as
 * lower-cased name => content. Every <a> element contributes its `href`,
 * except anchors whose `rel` attribute is exactly "nofollow"
 * (case-insensitive). The results are also stored on $this->links and
 * $this->metaTags.
 *
 * @param string $html HTML markup to parse.
 *
 * @return array ['links' => de-duplicated hrefs, 'meta' => name => content map]
 */
public function load($html)
{
    $this->crawler->clear();
    $this->crawler->addHtmlContent($html);

    $metaTags = [];
    // $metaTags must be captured by reference: a closure without `use`
    // writes to its own local variable and the outer array stays empty.
    $this->crawler->filter('meta')->each(function (Crawler $node) use (&$metaTags) {
        $name = $node->attr('name');
        if (null === $name || '' === $name) {
            // Tags such as <meta charset> or <meta property> have no name to key on.
            return;
        }

        $metaTags[strtolower($name)] = $node->attr('content');
    });

    $links = [];
    $this->crawler->filter('a')->each(function (Crawler $link) use (&$links) {
        // attr() yields null when the attribute is missing; cast before strtolower().
        $rel = (string) $link->attr('rel');
        if ('nofollow' === strtolower($rel)) {
            return;
        }

        $links[] = $link->attr('href');
    });

    $this->links = array_unique($links);
    $this->metaTags = $metaTags;

    return ['links' => $this->links, 'meta' => $metaTags];
}
/**
 * Read the property table from the given markup into the position.
 *
 * Every `#full-props-list tr` row that contains both a <th> and a <td>
 * contributes one attribute: trimmed header text => trimmed cell text.
 * The resulting map (possibly empty) is passed to setAttributes().
 *
 * @param Position $position Receives the collected attributes.
 * @param string   $content  Markup containing the property table.
 */
public function getPositionData(Position $position, $content)
{
    $scratch = new Crawler($content);
    $rows = $scratch->filter('#full-props-list tr');

    $attributes = [];
    foreach ($rows as $row) {
        // Re-point the scratch crawler at the current row only; $rows is a
        // separate crawler instance and is not affected by the clear().
        $scratch->clear();
        $scratch->addNode($row);

        $header = $scratch->filter('th');
        $cell = $scratch->filter('td');
        if ($header->count() <= 0 || $cell->count() <= 0) {
            continue;
        }

        $label = trim($header->text());
        $attributes[$label] = trim($cell->text());
    }

    $position->setAttributes($attributes);
}
/**
 * Fetch every stored Dmm page and record the list items found on it.
 *
 * For each Dmm record the page at its `href` is downloaded with a shared
 * cookie jar, the thumbnail anchors inside `ul#list` are located, and each
 * one is stored through DmmList::add() with its title, link and image URL.
 * The link of every stored item is echoed on its own line.
 */
public function handle()
{
    $this->jobLogInfo('dmm start');

    $pageCrawler = new Crawler();
    $httpClient = new Guzzle(['timeout' => 10, 'cookies' => true]);
    $cookieJar = new CookieJar();

    $processPage = function ($dmm) use ($httpClient, $cookieJar, $pageCrawler) {
        $response = $httpClient->get($dmm->href, ['cookies' => $cookieJar]);
        $html = $response->getBody();
        $pageCrawler->addHtmlContent($html);

        // Stores a single thumbnail anchor as a DmmList entry.
        $storeItem = function (Crawler $anchor) use ($dmm) {
            $title = trim($anchor->text());
            $href = $anchor->attr('href');
            $img_url = $anchor->filterXPath('//span[1]/img')->attr('src');

            DmmList::add($dmm->id, $title, $href, $img_url);
            echo "{$href}\n";
        };

        $anchors = $pageCrawler->filterXPath('//ul[@id="list"]/li/div[1]//p[@class="tmb"]/a');
        $anchors->each($storeItem);

        // Empty the shared crawler so the next page starts clean.
        $pageCrawler->clear();
    };

    \App\Model\Dmm::all()->map($processPage);

    $this->jobLogInfo('crawl end');
}
/**
 * clear() must leave the crawler without any nodes.
 */
public function testClear()
{
    // Seed the crawler with a single element created through a document.
    $document = new \DOMDocument();
    $element = $document->createElement('test');
    $subject = new Crawler($element);

    $subject->clear();

    $this->assertCount(0, $subject, '->clear() removes all the nodes from the crawler');
}
/**
 * clear() must leave the crawler without any nodes.
 */
public function testClear()
{
    // Create the node through a DOMDocument so it has an owner document.
    // A bare `new \DOMNode()` is an unattached, uninitialised node whose
    // properties cannot be read safely.
    $doc = new \DOMDocument();
    $node = $doc->createElement('test');

    $crawler = new Crawler($node);
    $crawler->clear();

    $this->assertCount(0, $crawler, '->clear() removes all the nodes from the crawler');
}
/**
 * Strip every element matching `<widgetItem>:last-child` from the markup.
 *
 * The content is parsed with the application charset, its first node is
 * imported under a synthetic `_root` element of a new document, the
 * matching elements are removed there, and the inner HTML of the first
 * <body> of that document is returned.
 *
 * @param string $content Markup to process.
 *
 * @return string Inner HTML of the first <body> after the removal.
 */
private function removeLastItem($content)
{
    $charset = \Yii::$app->charset;

    $workingDocument = new \DOMDocument('1.0', $charset);
    $crawler = new Crawler();
    $crawler->addHTMLContent($content, $charset);

    $rootElement = $workingDocument->appendChild($workingDocument->createElement('_root'));

    // Take the crawler's first node and copy it, with its subtree, into the working document.
    $crawler->rewind();
    $firstNode = $crawler->current();
    $rootElement->appendChild($workingDocument->importNode($firstNode, true));

    $xpath = new \DOMXPath($workingDocument);
    $lastItemExpression = CssSelector::toXPath($this->widgetItem . ':last-child');
    foreach ($xpath->query($lastItemExpression) as $lastItem) {
        $lastItem->parentNode->removeChild($lastItem);
    }

    // Reuse the crawler to read the result back out of the modified document.
    $crawler->clear();
    $crawler->add($workingDocument);

    return $crawler->filter('body')->eq(0)->html();
}
/**
 * Replace the contents of the shared crawler with the given markup.
 *
 * @param string $html Markup to load into the crawler.
 */
private function setHtml($html)
{
    $crawler = $this->crawler;

    // Remove nodes from any previous document before loading the new one.
    $crawler->clear();
    $crawler->addHtmlContent($html);
}