/**
 * Builds a list describing every subtitle found in the given HTML page.
 *
 * Each entry is keyed by the callback index plus one and may hold:
 *  - 'sub_link':    the href of an anchor found under the
 *                   "menu_titulo_buscador" element(s);
 *  - 'description': the trimmed text of a "buscador_detalle_sub" element,
 *                   with the inline AdSense bootstrap snippet stripped out.
 *
 * Links and descriptions are paired purely by their index in the page.
 *
 * @param string $html Raw HTML of the page to scan.
 * @return array
 */
private function getSubtitlesFromHtml($html)
{
    $page = new \Symfony\Component\DomCrawler\Crawler($html);
    $subtitles = [];

    $detailBlocks = $page->filterXPath('//*[@id="buscador_detalle"]');
    $titleBlocks = $page->filterXPath('//*[@id="menu_titulo_buscador"]');

    // First pass: one download link per anchor inside the title blocks.
    $storeLink = function ($anchor, $index) use (&$subtitles) {
        $subtitles[$index + 1]['sub_link'] = $anchor->attr('href');
    };
    $titleBlocks->filterXPath('//a')->each($storeLink);

    // Second pass: the description of each subtitle, minus the ad snippet
    // that ends up inside the node text.
    $storeDescription = function ($detail, $index) use (&$subtitles) {
        $withoutAds = str_replace(
            "(adsbygoogle = window.adsbygoogle || []).push({});",
            "",
            $detail->text()
        );
        $subtitles[$index + 1]['description'] = trim($withoutAds);
    };
    $detailBlocks->filterXPath('//*[@id="buscador_detalle_sub"]')->each($storeDescription);

    return $subtitles;
}
/**
 * Fetches the document at $url and returns the text of its <title> element.
 *
 * Returns null when the reader is disabled, when fetching fails, when the
 * document has no title node, or when DOM extraction throws. Fetch and
 * extraction failures are logged at error level and never propagated.
 *
 * @param string $url Address of the page to read.
 * @return string|null The title text, or null if it could not be obtained.
 */
public function readTitle($url)
{
    if (!$this->enabled) {
        return null;
    }

    // Phase 1: download. Any failure here ends the call.
    try {
        $html = $this->fetchUrl($url);
    } catch (\Exception $fetchError) {
        \Yii::getLogger()->log(
            'Crawler fetchUrl exception: ' . $fetchError->getMessage(),
            Logger::LEVEL_ERROR
        );

        return null;
    }

    // Phase 2: parse. $title stays null unless a title node is found and read.
    $title = null;
    try {
        $dom = new \Symfony\Component\DomCrawler\Crawler();
        $dom->addHtmlContent($html);

        $titleNodes = $dom->filterXPath('html/head/title');
        if ($titleNodes->count() > 0) {
            $title = $titleNodes->first()->text();
        }
    } catch (\Exception $domError) {
        \Yii::getLogger()->log(
            'Crawler DOM extraction exception: ' . $domError->getMessage(),
            Logger::LEVEL_ERROR
        );
    }

    return $title;
}
/**
 * Looks up the checkbox input named $name within the scope of $crawler and
 * asserts that exactly one such element exists.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $crawler Scope to search in.
 * @param string $name    Value of the checkbox's name attribute.
 * @param string $message Optional failure message; a default mentioning
 *                        $name is used when this is empty.
 *
 * @return \Symfony\Component\DomCrawler\Crawler The matched node set.
 */
public function assert_find_one_checkbox($crawler, $name, $message = '')
{
    $xpath = sprintf('//input[@type="checkbox" and @name="%s"]', $name);
    $matches = $crawler->filterXPath($xpath);

    // Fall back to the default message when the caller supplied none.
    if (!$message) {
        $message = "Failed asserting that exactly one checkbox with name {$name} exists in crawler scope.";
    }

    $this->assertEquals(1, count($matches), $message);

    return $matches;
}
/**
 * {@inheritdoc}
 *
 * Downloads the videos.sapo.pt page for the media's provider reference and
 * reads the original picture URL from the "pic" query parameter of the
 * thumbnail <link> element's href.
 *
 * @param MediaInterface $media Media whose provider reference identifies the video.
 *
 * @return array Array holding the picture URL under the "thumbnail_url" key.
 *
 * @throws \RuntimeException When the page cannot be retrieved, or when no
 *                           thumbnail URL can be extracted from it.
 */
protected function getMetadataThumbnail(MediaInterface $media)
{
    $url = sprintf('http://videos.sapo.pt/%s', $media->getProviderReference());

    try {
        $html = $this->browser->get($url)->getContent();
    } catch (\RuntimeException $e) {
        // The exception code is an int parameter; pass 0 (its default) rather
        // than null, which is deprecated on PHP 8.1+.
        throw new \RuntimeException('Unable to retrieve the thumbnail information for :' . $url, 0, $e);
    }

    $crawler = new \Symfony\Component\DomCrawler\Crawler($html);
    $metadata = [];

    // XPath is used on purpose: the CSS form, filter('link[itemprop="thumbnailUrl"]'),
    // would require the Symfony CssSelector component.
    $thumbnail_node = $crawler->filterXPath('//*[@id="body_content"]/div/article/div[2]/link[2]');

    if ($thumbnail_node->count() === 1) {
        // Example href:
        // http://thumbs.web.sapo.io/?pic=http://cache04.stormap.sapo.pt/vidstore18/thumbnais/54/88/76/11128693_4b5Bb.jpg&crop=center&tv=2&W=1280&H=960&errorpic=http://assets.web.sapo.io/sapovideo/sv/20150903/imgs/playlist_default_thumb_error_pt.gif
        $thumbnail_url = $thumbnail_node->getNode(0)->getAttribute('href');

        // parse_url() yields null when there is no query string and false on a
        // malformed URL; only a real, non-empty query is worth parsing. Either
        // failure falls through to the RuntimeException below instead of
        // raising an undefined-key warning.
        $query = parse_url($thumbnail_url, PHP_URL_QUERY);

        if (is_string($query) && $query !== '') {
            $data = [];
            parse_str($query, $data);

            if (isset($data['pic'])) {
                $metadata['thumbnail_url'] = $data['pic'];
            }
        }
    }

    if (empty($metadata)) {
        throw new \RuntimeException('Unable to decode the video information for :' . $url);
    }

    return $metadata;
}
#!/usr/bin/php
<?php

// Definition of the Symfony DomCrawler package: the service it requires,
// where its sources are cloned from, how its namespace is autoloaded, and a
// small usage example.
$config = [
    'require_services' => ['sf_css_selector'],
    'git_urls' => ['https://github.com/yfix/DomCrawler.git' => 'sf_dom_crawler/'],
    'autoload_config' => ['sf_dom_crawler/' => 'Symfony\\Component\\DomCrawler'],
    'example' => function () {
        $crawler = new \Symfony\Component\DomCrawler\Crawler();
        $crawler->addContent('<html><body><p>Hello World!</p></body></html>');

        echo $crawler->filterXPath('descendant-or-self::body/p')->text();
        echo PHP_EOL;

        // filter() takes a CSS selector, so it requires the CSS selector service.
        echo $crawler->filter('body > p')->text();
        echo PHP_EOL;
    },
];

// $return_config is never assigned in this file; presumably an including
// script sets it to request the definition only (TODO confirm against the
// callers). empty() keeps the same truthiness check without raising an
// "Undefined variable" warning when the script is executed directly.
if (!empty($return_config)) {
    return $config;
}

require_once __DIR__ . '/_yf_autoloader.php';
new yf_autoloader($config);