The XPath expression is evaluated in the context of the crawler, which
is considered as a fake parent of the elements inside it.
This means that a child selector "div" or "./div" will match only
the div elements of the current crawler, not their children.
public filterXPath ( string $xpath ) : Crawler

| Name | Type | Description |
|---|---|---|
| $xpath | string | An XPath expression |
| return | Crawler | A new instance of Crawler with the filtered list of nodes |
/**
 * Builds a Method object for the current specification from a DOM node.
 *
 * @param \DOMNode $node A DOMNode describing the method
 *
 * @return Method
 *
 * @throws \Exception When the node does not hold exactly one comment, or holds more than one return node
 */
public function getMethod(\DOMNode $node)
{
    $crawler = new Crawler($node);
    $name = $crawler->attr('name');

    $method = new Method($name);

    // Names starting with "get"/"is" or ending in "ToString" are accessors; everything else is an action.
    $isAccessor = preg_match('/(^(get|is)|ToString$)/', $name);
    $method->setType($isAccessor ? Method::TYPE_ACCESSOR : Method::TYPE_ACTION);

    // Description: exactly one comment node is required.
    $comments = $crawler->filterXPath('//comment');
    if (count($comments) !== 1) {
        throw new \Exception('Only one comment expected');
    }
    $comments->rewind();
    $method->setDescription($this->getInner($comments->current()));

    // Parameters
    foreach ($crawler->filterXPath('//parameter') as $parameterNode) {
        $method->addParameter($this->getParameter($parameterNode));
    }

    // Return: optional, but at most one node is allowed.
    $returns = $crawler->filterXPath('//return');
    $returnCount = count($returns);
    if ($returnCount > 1) {
        throw new \Exception("Should not be more than one return node");
    }
    if ($returnCount == 1) {
        $returns->rewind();
        list($returnType, $returnDescription) = $this->getReturn($returns->current());
        $method->setReturnType($returnType);
        $method->setReturnDescription($returnDescription);
    }

    return $method;
}
/**
 * Search for torrents.
 *
 * Requests the feed for the query and converts every channel item into a Torrent.
 *
 * @param string $query
 * @param int $category
 * @return array Array of torrents. Either empty or filled.
 */
public function search($query, $category)
{
    // The request is sent with a browser-like User-Agent header.
    $requestOptions = [
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        ],
    ];

    try {
        $url = $this->makeUrl($query, $category);
        $response = $this->httpClient->get($url, $requestOptions);
        $crawler = new Crawler((string) $response->getBody());
    } catch (\Exception $e) {
        // Any failure while building the URL, fetching or loading the body yields no results.
        return [];
    }

    $torrents = [];
    foreach ($crawler->filterXPath('//channel/item') as $item) {
        $torrent = new Torrent();
        $feedItem = new Crawler($item);

        // Set details for torrent.
        $torrent->setSite($this->tag);
        $torrent->setTitle($feedItem->filterXPath('//title')->text());
        $torrent->setSeeders((int) $feedItem->filterXPath('//torrent:seeds')->text());
        $torrent->setLeechers((int) $feedItem->filterXPath('//torrent:peers')->text());
        $torrent->setMagnet($feedItem->filterXPath('//torrent:magnetURI')->text());

        $contentLength = (int) $feedItem->filterXPath('//torrent:contentLength')->text();
        $torrent->setSize($this->formatBytes($contentLength));

        $torrent->setAge($feedItem->filterXPath('//pubDate')->text());
        $torrent->setCategory($feedItem->filterXPath('//category')->text());

        $torrents[] = $torrent;
    }

    return $torrents;
}
/**
 * Parses the cafe map in the form ['cafe name' => 'link to the cafe details'].
 *
 * @param string $html
 *
 * @return array
 */
public function parseCafeList($html)
{
    $this->setHtml($html);

    $cafeLinks = [];
    $anchors = $this->crawler->filterXPath('//body//ul[@class="xoxo"]//li/a');

    // The anchor title is used as the cafe name, its href as the link.
    $anchors->each(function (Crawler $anchor) use (&$cafeLinks) {
        $name = $anchor->attr('title');
        $cafeLinks[$name] = $anchor->attr('href');
    });

    return $cafeLinks;
}
/**
 * Checks that the page head exposes a usable OAuth2 client id and secret.
 *
 * @depends testAngularSammui
 * @param Crawler $crawler
 */
public function testSammuiClient(Crawler $crawler)
{
    $idMeta = $crawler->filterXPath('//html/head/meta[@name="sammui-oauth2-client-id"]');
    $clientId = $idMeta->attr('content');

    $secretMeta = $crawler->filterXPath('//html/head/meta[@name="sammui-oauth2-client-secret"]');
    $clientSecret = $secretMeta->attr('content');

    $credentials = array($clientId, $clientSecret);

    // Each check runs for the id first, then the secret, before moving on to the next check.
    foreach ($credentials as $credential) {
        $this->assertNotNull($credential);
    }
    foreach ($credentials as $credential) {
        $this->assertTrue(is_string($credential), $credential);
    }
    foreach ($credentials as $credential) {
        $this->assertStringStartsNotWith('no-client-found-for', $credential);
    }
}
/**
 * Returns the directives listed in the page's robots meta tag.
 *
 * The content attribute is lower-cased and split on commas. Every trimmed
 * directive longer than two characters is stored as both key and value.
 *
 * @return array Directives keyed by themselves; empty when the page has no robots meta tag
 */
public function getRobotsData()
{
    $meta = $this->crawler->filterXPath("//meta[@name='robots']");

    // attr() throws on an empty node list, so a page without the tag is reported as "no directives".
    if ($meta->count() === 0) {
        return [];
    }

    // The cast covers a meta tag that has no content attribute.
    $content = strtolower((string) $meta->attr('content'));

    $robots = [];
    foreach (explode(',', $content) as $directive) {
        $directive = trim($directive);
        if (strlen($directive) > 2) {
            $robots[$directive] = $directive;
        }
    }

    return $robots;
}
/**
 * Extracts the advert fields from the crawled page.
 *
 * A key is only set when its element is found, except for 'images' and
 * 'imageUrls', which are always present.
 *
 * @param Crawler $crawler
 *
 * @return array Possible keys: type, name, images, imageUrls, region, price, currency, email, phone, text
 */
public function parse(Crawler $crawler)
{
    $params = array();

    // Trimmed text of the first matched node.
    $firstText = function (Crawler $nodes) {
        return trim($nodes->first()->text());
    };

    $node = $crawler->filter('.text .druh .nabizim');
    if ($node->count()) {
        $params['type'] = $firstText($node);
    }

    $node = $crawler->filter('.text h2 a');
    if ($node->count()) {
        $params['name'] = $firstText($node);
    }

    $node = $crawler->filter('.item .image img');
    if ($node->count()) {
        $params['images'] = 1;
        // The host prefix of the image URL is hard-coded.
        $params['imageUrls'] = array('http://midi.cz' . $node->first()->attr('src'));
    } else {
        $params['images'] = 0;
        $params['imageUrls'] = array();
    }

    $node = $crawler->filterXPath('//*[@class="table_info"]//tr[3]/td[2]');
    if ($node->count()) {
        $params['region'] = $firstText($node);
    }

    // The price box text is split on spaces: amount first, optional currency second.
    $node = $crawler->filter('.priceBox');
    if ($node->count()) {
        $exploded = explode(' ', $firstText($node));
        $params['price'] = $exploded[0];
        if (isset($exploded[1])) {
            $params['currency'] = $exploded[1];
        }
    }

    // email
    $node = $crawler->filterXPath('//*[@class="table_info"]//tr[4]/td[2]');
    if ($node->count()) {
        $params['email'] = $firstText($node);
    }

    // phone
    $node = $crawler->filterXPath('//*[@class="table_info"]//tr[5]/td[2]');
    if ($node->count()) {
        $params['phone'] = $firstText($node);
    }

    $node = $crawler->filterXPath('//*[@id="mainCol"]/div[1]/div[2]/p[3]');
    if ($node->count()) {
        $params['text'] = $firstText($node);
    }

    return $params;
}
/**
 * Rebuilds the post content from the children of the entry article, then translates it.
 *
 * Each child is passed through processNode(). Results that are null, are
 * named "a", or carry no text are skipped; the rest are appended as
 * <nodeName>text</nodeName>.
 */
public function processContent()
{
    $article = $this->crawler->filterXPath('//article[contains(@class, "entry")]');
    $this->post->content = '';

    $article->children()->each(function (Crawler $child) {
        $processed = $this->processNode($child);
        if ($processed === null) {
            return;
        }
        if ($processed->nodeName === 'a' || !isset($processed->text)) {
            return;
        }

        $tag = trim($processed->nodeName);
        $this->post->content .= '<' . $tag . '>' . $processed->text . '</' . $tag . '>';
    });

    $this->post->content = $this->translator->translate($this->post->content);
}
/**
 * Returns a list of episodes for a specified season of a TV show.
 *
 * @param string $id ID as contained in the URL for the TV show of the
 *        form http://www.imdb.com/title/ID/
 * @param string $season Season for which to return episodes
 * @return array Associative array indexed by episode number of
 *         associative arrays each containing the trimmed title, airdate
 *         and description of an individual episode within the season
 */
public function getSeasonEpisodes($id, $season)
{
    $url = $this->baseUrl . '/title/' . $id . '/episodes?season=' . $season;
    $crawler = $this->request('GET', $url);

    $infoBlocks = $crawler->filterXPath('//div[contains(@class, "eplist")]/div[contains(@class, "list_item")]/div[@class="info"]');

    $episodes = array();
    foreach ($infoBlocks as $infoBlock) {
        $info = new Crawler($infoBlock);

        $number = $info->filterXPath('//meta[@itemprop="episodeNumber"]')->attr('content');
        $details = array(
            'title' => $info->filterXPath('//strong/a[@itemprop="name"]')->text(),
            'airdate' => $info->filterXPath('//div[@class="airdate"]')->text(),
            'description' => $info->filterXPath('//div[@class="item_description"]')->text(),
        );

        $episodes[$number] = array_map('trim', $details);
    }

    return $episodes;
}
/**
 * Fetches a random post from thecodinglove.com and returns its title and image URL.
 *
 * When the response status is not 200 a fixed fallback entry is returned
 * whose title carries the status code.
 *
 * @since 0.9.0
 * @return array Associative array with the keys 'title' and 'image_url'
 * @author nguyenvanduocit
 */
public function getRandomFactOfProgramming()
{
    $client = new Client();
    $response = $client->get('http://thecodinglove.com/random');

    if ($response->getStatusCode() !== 200) {
        return array(
            'title' => 'No image found' . $response->getStatusCode(),
            'image_url' => 'http://funny.topdev.vn/wp-content/uploads/images/when-they-tell-me-the-website-has-to-be-supported-by-ie6-1439201300.gif',
        );
    }

    $result = array('title' => '', 'image_url' => '');
    $crawler = new Crawler($response->getBody()->getContents());

    // A Crawler object is always truthy, so the match count is tested instead:
    // text() and attr() throw when the node list is empty.

    // Title
    $titleCrawler = $crawler->filterXPath('//div[@id="post1"]//h3');
    if ($titleCrawler->count() > 0) {
        $result['title'] = $titleCrawler->text();
    }

    // Image
    $imageCrawler = $crawler->filterXPath('//div[@class="bodytype"]//img');
    if ($imageCrawler->count() > 0) {
        $result['image_url'] = $imageCrawler->attr('src');
    }

    return $result;
}
/**
 * Swaps the src of every img.inline-image in the message body for an embedded image.
 *
 * Both the single-quoted and the double-quoted form of each src attribute
 * are replaced in the raw HTML. The body is only rewritten when at least one
 * replacement was collected.
 *
 * @param Swift_Message $message
 */
protected function inlineImages(Swift_Message $message)
{
    $html = $message->getBody();

    $crawler = new Crawler();
    $crawler->addHtmlContent($html);

    $embedded = array();     // normalized source => embedded image reference
    $replacements = array(); // src attribute as written => src attribute pointing at the embedded image

    $inlineImages = $crawler->filterXPath("//img[contains(concat(' ',normalize-space(@class), ' '), ' inline-image ')]");
    foreach ($inlineImages as $img) {
        $src = $img->getAttribute('src');
        $doubleQuoted = 'src="' . $src . '"';

        // This source has already been handled.
        if (isset($replacements[$doubleQuoted])) {
            continue;
        }

        // if starting with one slash, use local file
        $path = $src;
        if (preg_match('#^/[^/]#', $src)) {
            $path = $this->web_directory . parse_url($src, PHP_URL_PATH);
        }

        // Embed each distinct file only once.
        if (!isset($embedded[$path])) {
            $embedded[$path] = $message->embed(Swift_Image::fromPath($path));
        }

        $embeddedSrc = 'src="' . $embedded[$path] . '"';
        $replacements['src=\'' . $src . '\''] = $embeddedSrc;
        $replacements[$doubleQuoted] = $embeddedSrc;
    }

    if (count($replacements)) {
        $message->setBody(str_replace(array_keys($replacements), array_values($replacements), $html));
    }
}
/**
 * Reads every //Data/Items/Item element of the given file into a Sale.
 *
 * A tag that the repository lookup does not return is created, persisted
 * and flushed before the sale is built.
 *
 * @param File $file
 * @return Sale[]
 */
public function crawl(File $file)
{
    $crawler = new Crawler(file_get_contents($file->getPathname()));
    $sales = [];

    /** @var $item \DOMElement */
    foreach ($crawler->filterXPath('//Data/Items/Item') as $item) {
        $sale = new Sale();

        $tagName = $item->getAttribute('Tag');
        $tag = $this->getEm()->getRepository('AffiliateDashboardBundle:Tag')->findbyName($tagName);
        if (!$tag) {
            $tag = new Tag();
            $tag->setName($tagName);
            $this->getEm()->persist($tag);
            $this->getEm()->flush();
        }

        $sale->setAsin($item->getAttribute('ASIN'));
        $sale->setCategory($item->getAttribute('Category'));

        // EDate is passed to date() as a timestamp.
        $formattedDate = date('Y-m-d H:i:s', $item->getAttribute('EDate'));
        $sale->setDate(new \DateTime($formattedDate));

        $sale->setEarnings($this->parseFloat($item->getAttribute('Earnings')));
        $sale->setLinkType($item->getAttribute('LinkType'));
        $sale->setPrice($this->parseFloat($item->getAttribute('Price')));
        $sale->setQty((int) $item->getAttribute('Qty'));
        $sale->setRate($this->parseFloat($item->getAttribute('Rate')));
        $sale->setRevenue($this->parseFloat($item->getAttribute('Revenue')));
        $sale->setAffiliateTag($tag);
        $sale->setSeller($item->getAttribute('Seller') ?: null);
        $sale->setTitle($item->getAttribute('Title'));

        $sales[] = $sale;
    }

    return $sales;
}
/**
 * Returns geo-IP data for an address.
 *
 * NOTE(review): the unconditional `return $data;` right after the
 * initialisation makes everything below it unreachable, so this method
 * currently always returns an empty array. Presumably the remote lookup was
 * switched off on purpose -- confirm before removing either the early return
 * or the dead code.
 *
 * @param string $ip Address inserted into the lookup URL (only used by the unreachable code)
 *
 * @return array Currently always empty
 */
protected function getGeoIpData($ip)
{
    $data = array();
    return $data;
    // ---- unreachable from here on ----
    $html = file_get_contents(sprintf('http://www.geoiptool.com/en/?IP=%s', $ip));
    $crawler = new Crawler($html);
    // Inner HTML of the matched table, reduced to plain text and split into lines.
    $temp = $crawler->filterXPath('//table[@class="tbl_style"][3]')->html();
    $temp = strip_tags($temp);
    $atemp = explode("\n", $temp);
    array_shift($atemp);
    $tdata = array();
    $key = null;
    foreach ($atemp as $t) {
        // A line containing a colon starts a new key: colon removed, whitespace runs
        // replaced by underscores, lower-cased.
        if (preg_match('/:/', $t)) {
            $key = preg_replace('/:/', '', trim($t));
            $key = preg_replace('/\\s+/', "_", $key);
            $key = strtolower($key);
            continue;
        }
        // Every other line is collected under the most recent key.
        $tdata[$key][] = $t;
    }
    foreach ($tdata as $key => $val) {
        $val = trim(implode(' ', $val));
        // Empty values and the placeholders '+' and '()' are stored as null.
        $data[$key] = (!empty($val) and $val !== '+' and $val !== '()') ? $val : null;
    }
    return $data;
}
/**
 * Asserts that forms containing a password field are not used on a plain-HTTP page.
 *
 * Forms whose action starts with https:// are accepted as they are. For any
 * other form the page URL itself has to start with https://. Every form is
 * identified by a tag path built from its ancestors and is only asserted
 * once per distinct path.
 *
 * @param Response $response
 */
protected function doValidation(Response $response)
{
    $crawler = new Crawler($response->getBody());

    // ".//input" keeps the predicate relative to each form. A bare "//input" searches the
    // whole document and would select every form of a page that has any password field.
    $forms = $crawler->filterXPath('//form[.//input[@type="password"]]');
    $url = (string) $response->getUri();

    foreach ($forms as $form) {
        $action = $form->getAttribute('action');
        if (strpos($action, 'https://') === 0) {
            continue;
        }

        // Build the identifying path. The loop condition advances to the next ancestor
        // before the first tag name is read, so the walk starts at the grandparent.
        $fullPath = $form->tagName;
        $parent = $form->parentNode;
        while ($parent = $parent->parentNode) {
            if (property_exists($parent, 'tagName')) {
                $fullPath = $parent->tagName . '/' . $fullPath;
            } else {
                break;
            }
        }

        if (in_array($fullPath, $this->knownIdentifier, true)) {
            continue;
        }
        $this->knownIdentifier[] = $fullPath;

        // The scheme must be at the start of the URL; "https://" appearing later
        // (for example inside a query string) does not make the page secure.
        $this->assert(strpos($url, 'https://') === 0, 'Password is transferred insecure using HTTP.');
    }
}
/**
 * @inheritdoc
 */
protected function parse(Requests_Response $requests)
{
    $crawler = new Crawler();
    $crawler->addContent($requests->body);

    $listItems = $crawler->filterXPath('//*[@id="content"]/div/div[2]/div[1]/div[1]/ul/li');
    $results = array();

    /** @var DOMElement $listItem */
    foreach ($listItems as $listItem) {
        $item = new Crawler();
        $item->add($listItem);

        $tags = [];
        /** @var DOMElement $tagNode */
        foreach ($item->filter(".horizontal-separated-list li") as $tagNode) {
            $tags[] = $tagNode->textContent;
        }

        $result = new Result();
        $result->setTitle(trim($item->filter(".details a")->text()));
        $result->setTags($tags);

        // The id is the part after "--" in the third "/"-separated piece of the link path.
        $relUrl = $item->filter(".details a")->attr("href");
        $path = parse_url($relUrl)["path"];
        $pathPieces = explode("/", $path);
        $slugPieces = explode("--", $pathPieces[2]);
        $id = $slugPieces[1];

        $result->setId($this->getName() . "_" . intval($id));
        $result->setUrl("http://www.anibis.ch/" . $relUrl);
        $result->setPrice($item->filter(".price")->text());
        $result->setDescription($item->filter(".details .description")->text());

        $results[] = $result;
    }

    return $results;
}
/**
 * Scrapes the product list found at a URL and parses every linked product page.
 *
 * @param string $url The url to scrape.
 * @return \Slice\CliApp\ScrapeResults The results of the scrape task.
 * @throws ScrapeException When the list or a product link cannot be parsed, or no products are found.
 */
public function getProductsForUrl($url)
{
    // Download the remote document and load it into a DOM crawler
    $rawHTML = $this->downloader->download($url);
    $crawler = new Crawler();
    $crawler->addContent($rawHTML);

    // Locate all product li elements
    try {
        $productList = $crawler->filterXPath($this->productListXpath);
    } catch (\InvalidArgumentException $e) {
        // Re-thrown as a ScrapeException so the command can handle it uniformly
        throw new ScrapeException($this->configValues['error_msg']['product_parse_error']);
    }

    // A page without products is not supported
    if (count($productList) == 0) {
        throw new ScrapeException($this->configValues['error_msg']['no_products']);
    }

    // Visit each product li
    $productList->each(function ($liCrawler) {
        // Read the product detail page url from the link
        try {
            $productURL = $liCrawler->filterXPath($this->pdpLinkXpath)->attr('href');
        } catch (\InvalidArgumentException $e) {
            throw new ScrapeException($this->configValues['error_msg']['product_parse_error']);
        }

        // Parse the detail page and add it to the results container
        $this->results->addProduct($this->pdpParser->parseUrl($productURL));
    });

    return $this->results;
}
/**
 * Filters the crawler by an XPath locator.
 *
 * @param $locator
 * @return Crawler
 * @throws MalformedLocator When the locator is not recognised as XPath
 */
protected function filterByXPath($locator)
{
    if (Locator::isXPath($locator)) {
        return $this->crawler->filterXPath($locator);
    }

    throw new MalformedLocator($locator, 'xpath');
}
/**
 * Filter the price present on each countries price page, and return the price
 *
 * @param string $content HTML of the price page
 *
 * @return string The trimmed text of the price cell, or an empty string when the cell is not found
 */
public function filterPrice($content)
{
    $crawler = new Crawler();
    $crawler->addHtmlContent($content);

    $cells = $crawler->filterXPath("html/body/div[1]/div[3]/div/div/div[3]/div[4]/div/table/tr[1]/td[2]");

    // extract() takes a list of attribute names; the special name '_text' selects the node text.
    $texts = $cells->extract(array('_text'));

    // No matching cell: return an empty string instead of reading a missing index.
    if (!isset($texts[0])) {
        return '';
    }

    return trim($texts[0]);
}
/**
 * Runs a selector against the crawler.
 *
 * The selector is first treated as CSS and converted to XPath. When it
 * cannot be parsed as CSS it is handed to the crawler unchanged.
 * Errors raised by the XPath filtering call itself are suppressed.
 */
protected function match($selector)
{
    $xpath = $selector;

    try {
        $xpath = \Symfony\Component\CssSelector\CssSelector::toXPath($selector);
    } catch (\Symfony\Component\CssSelector\Exception\ParseException $notCss) {
        // Not parseable as CSS: keep the selector as given.
    }

    return @$this->crawler->filterXPath($xpath);
}
/**
 * Returns the content attribute of the description meta tag.
 *
 * @param Crawler $crawler
 *
 * @return string|null Null when the lookup raises an InvalidArgumentException
 */
public function find(Crawler $crawler)
{
    try {
        $meta = $crawler->filterXPath('//meta[@name="description"]');
        $description = $meta->attr('content');
    } catch (\InvalidArgumentException $e) {
        $description = null;
    }

    return $description;
}
/**
 * Returns the content attribute of the last parsely-author meta tag.
 *
 * @param Crawler $crawler
 *
 * @return string|null Null when no such meta tag exists
 */
public function extractAuthor(Crawler $crawler)
{
    $authorMetas = $crawler->filterXPath('//meta[@name="parsely-author"]');
    if ($authorMetas->count() === 0) {
        return null;
    }

    // When several tags match, the last one wins.
    return $authorMetas->last()->attr('content');
}
/**
 * Returns the content attribute of the og:type meta tag.
 *
 * @param Crawler $crawler
 *
 * @return string|null Null when the lookup raises an InvalidArgumentException
 */
public function find(Crawler $crawler)
{
    try {
        $meta = $crawler->filterXPath('//meta[@property="og:type"]');
        $type = $meta->attr('content');
    } catch (\InvalidArgumentException $e) {
        $type = null;
    }

    return $type;
}
/**
 * Returns the src attribute of the first img element.
 *
 * @param Crawler $crawler
 *
 * @return string|null Null when the lookup raises an InvalidArgumentException
 */
public function find(Crawler $crawler)
{
    try {
        $images = $crawler->filterXPath('//img');
        $src = $images->attr('src');
    } catch (\InvalidArgumentException $e) {
        $src = null;
    }

    return $src;
}
/**
 * Returns the trimmed text of the head title element.
 *
 * @param Crawler $crawler
 *
 * @return string|null Null when the lookup raises an InvalidArgumentException
 */
public function find(Crawler $crawler)
{
    try {
        $titleNode = $crawler->filterXPath('//head/title');
        $title = trim($titleNode->text());
    } catch (\InvalidArgumentException $e) {
        $title = null;
    }

    return $title;
}
/**
 * Dumping with an unknown theme name must produce a hidden input whose value is 'Simple'.
 */
public function testUnknownThemeFallbacksToDefault()
{
    $value = 1;

    $this->dumper->setTheme('unknown');
    $this->dumper->setFormat(Format\HtmlFormat::FORMAT_NAME);
    $output = $this->dumper->dump($value);

    $crawler = new Crawler($output);
    $hiddenInput = $crawler->filterXPath('//input[@type="hidden"]');

    $this->assertEquals('Simple', $hiddenInput->attr('value'));
}
/**
 * Execute the console command.
 *
 * For each fetched feed the configured XPath pointers are evaluated against
 * the feed HTML, then an organisation and its location are saved from the
 * extracted values.
 *
 * @return void
 */
public function handle()
{
    foreach (RssFeed::limit(1)->get() as $feed) {
        $this->crawler->addContent($feed->html);

        // Start from an empty set for every feed so no value carries over from a previous one.
        $data = [];
        foreach ($this->pointers as $key => $xpath) {
            try {
                $data[$key] = $this->crawler->filterXPath($xpath)->text();
            } catch (\Exception $e) {
                // A pointer that cannot be resolved is recorded as missing.
                $data[$key] = null;
            }
        }
        $data['url'] = $feed->url;

        $organisation = $this->saveOrganisation($data);
        $location = $this->saveLocation();
        $organisation->location()->save($location);

        // NOTE(review): the vacancy is fetched or instantiated here but never used or
        // saved in the code visible in this method -- confirm whether this is unfinished.
        $vacancy = Vacancy::firstOrNew(['ref' => $data['ref']]);
    }
}
/**
 * Reads the advert values out of the crawled form.
 *
 * A key is only set when its form element is found, except for 'images'
 * and 'imageUrls', which are always present.
 *
 * @param Crawler $crawler
 *
 * @return array Possible keys: type, categoryId, category, regionId, region, name, price,
 *               currency, email, phone, text, images, imageUrls
 */
public function parse(Crawler $crawler)
{
    $params = array();

    // Offer/demand radio button
    $typeInput = $crawler->filter('input[name="nabpop"]:checked');
    if ($typeInput->count()) {
        $params['type'] = trim($typeInput->first()->attr('value'));
    }

    // Category
    $categoryOption = $crawler->filter('select[name="kategorie"] option:selected');
    if ($categoryOption->count()) {
        $selected = $categoryOption->first();
        $params['categoryId'] = trim($selected->attr('value'));
        $params['category'] = trim($selected->text());
    }

    // Region
    $regionOption = $crawler->filter('select[name="kraj"] option:selected');
    if ($regionOption->count()) {
        $selected = $regionOption->first();
        $params['regionId'] = trim($selected->attr('value'));
        $params['region'] = trim($selected->text());
    }

    // Name
    $nameInput = $crawler->filter('input[name="nazev"]');
    if ($nameInput->count()) {
        $params['name'] = trim($nameInput->first()->attr('value'));
    }

    // Price
    $priceInput = $crawler->filter('input[name="cena"]');
    if ($priceInput->count()) {
        $params['price'] = trim($priceInput->first()->attr('value'));
    }

    // Currency
    $currencyOption = $crawler->filter('select[name="mena"] option:selected');
    if ($currencyOption->count()) {
        $params['currency'] = trim($currencyOption->first()->attr('value'));
    }

    // E-mail: assembled from the first and third quoted fragments of the script that follows #bflm
    $emailScript = $crawler->filterXPath('//*[@id = "bflm"]/following-sibling::script');
    if ($emailScript->count()) {
        $script = $emailScript->text();
        if (preg_match('/\\("bflm"\\)\\.value\\=\'(.*)\'(.*)\'(.*)\'\\;/', $script, $matches)) {
            $params['email'] = $matches[1] . '@' . $matches[3];
        }
    }

    // Phone
    $phoneInput = $crawler->filter('input[name="telefon"]');
    if ($phoneInput->count()) {
        $params['phone'] = trim($phoneInput->first()->attr('value'));
    }

    // Advert text
    $textArea = $crawler->filter('textarea[name="prispevek"]');
    if ($textArea->count()) {
        $params['text'] = trim($textArea->first()->text());
    }

    // Images: number of links and their targets
    $imageLinks = $crawler->filter('.InzeratObrd a');
    $imageUrls = array();
    $imageLinks->each(function (Crawler $link) use (&$imageUrls) {
        $imageUrls[] = $link->attr('href');
    });
    $params['images'] = trim($imageLinks->count());
    $params['imageUrls'] = $imageUrls;

    return $params;
}
/**
 * Returns the nodes of the given HTML that match an XPath expression.
 *
 * @param string $html
 * @param string $xpath
 *
 * @return Crawler The non-empty filtered node list
 *
 * @throws Exception With code Exception::NODE_NOT_FOUND when nothing matches
 */
protected function getNode($html, $xpath)
{
    $document = new Crawler($html);
    $matches = $document->filterXPath($xpath);

    if ($matches->count() > 0) {
        return $matches;
    }

    throw new Exception("Html does not contain `{$xpath}`.", Exception::NODE_NOT_FOUND);
}
/**
 * Scraps og:title off the page content
 *
 * @param string $url
 * @return string The og:title content, 'Unable to parse' when the tag is missing,
 *                or the exception message when the request or parsing fails
 */
public function scrap($url)
{
    $title = 'Unable to parse';

    $this->request->setMethod(HTTP_METH_GET);
    $this->request->setUrl($url);

    try {
        $response = $this->request->send();

        // The crawler instance is reused between calls. Without clearing it, nodes of
        // earlier pages stay in the list and getNode(0) below would keep returning the
        // meta tag of the first page that was ever scraped.
        $this->crawler->clear();
        $this->crawler->addHtmlContent($response->getBody());

        $subCrawler = $this->crawler->filterXPath('//head/meta[@property="og:title"]');
        $meta = $subCrawler->getNode(0);
        if ($meta) {
            $title = $meta->getAttribute('content');
        }
    } catch (Exception $e) {
        $title = $e->getMessage();
    }

    return $title;
}
/**
 * Turns a markdown document with YAML front matter into page data.
 *
 * The title is taken from the h1 headings, which are then removed from the
 * body. The leading "../.." is stripped from image sources. The part of
 * the body before the first <hr> becomes the intro.
 *
 * @param string $content
 *
 * @return array With the keys title, keywords, description, intro, cover, content and tags
 */
protected function process($content)
{
    $parser = new Parser('yaml', 'markdown');
    $document = $parser->parse($content);

    $crawler = new Crawler();
    $crawler->addHtmlContent($document->getBody());

    // The first heading with a non-empty value provides the title; every h1 is removed.
    $title = '';
    foreach ($crawler->filterXPath('//h1') as $heading) {
        if (!$title) {
            $title = $heading->nodeValue;
        }
        $heading->parentNode->removeChild($heading);
    }

    foreach ($crawler->filterXPath('//img') as $image) {
        $cleanedSrc = str_replace('../..', '', $image->getAttribute('src'));
        $image->setAttribute('src', $cleanedSrc);
    }

    $html = $crawler->html();
    $intro = explode('<hr>', $html)[0];
    $body = str_replace('<hr>', '', $html);

    // Front matter values win; the description falls back to the first 150 characters of the plain-text intro.
    $keywords = isset($document->head['keywords']) ? $document->head['keywords'] : '';
    $description = isset($document->head['description'])
        ? $document->head['description']
        : mb_substr(strip_tags(trim($intro)), 0, 150);
    $cover = isset($document->head['cover']) ? $document->head['cover'] : '';
    $tags = isset($document->head['tags']) ? $document->head['tags'] : '';

    return [
        'title' => $title ?: '',
        'keywords' => $keywords,
        'description' => $description,
        'intro' => trim($intro),
        'cover' => $cover,
        'content' => trim($body),
        'tags' => $tags,
    ];
}
/**
 * Extracts the company name and its list of activities from the HTML.
 *
 * The first node matched by XPATH_ACTIVITIES is skipped; every later one
 * yields an entry built from its first four cells.
 *
 * @param string $html
 *
 * @return array With the keys 'razonSocial' and 'actividades'
 */
private function parse($html)
{
    $crawler = new Crawler($html);

    $rawName = $crawler->filterXPath(self::XPATH_RAZON_SOCIAL)->text();
    $razonSocial = ucwords(strtolower(trim($rawName)));

    $actividades = [];
    $rows = $crawler->filterXPath(self::XPATH_ACTIVITIES);
    $rows->each(function (Crawler $row, $index) use (&$actividades) {
        if ($index <= 0) {
            return;
        }

        $actividades[] = [
            'giro' => $row->filterXPath('//td[1]/font')->text(),
            'codigo' => (int) $row->filterXPath('//td[2]/font')->text(),
            'categoria' => $row->filterXPath('//td[3]/font')->text(),
            'afecta' => $row->filterXPath('//td[4]/font')->text() == 'Si',
        ];
    });

    return ['razonSocial' => $razonSocial, 'actividades' => $actividades];
}