/**
 * Requests the planned page, optionally downloads the images found inside the
 * selected fragment, writes the fragment's text to a file and returns that text.
 *
 * Keys read from $this->plan: 'method', 'uri', one of 'selector' (CSS) or
 * 'xpath', and 'images' (truthy to download images).
 *
 * @return string Text of the selected nodes.
 *
 * @throws \InvalidArgumentException When the plan defines neither 'selector' nor 'xpath'.
 */
public function process()
{
    $crawler = $this->client->request($this->plan['method'], $this->plan['uri']);

    if (isset($this->plan['selector'])) {
        $selection = $crawler->filter($this->plan['selector']);
    } elseif (isset($this->plan['xpath'])) {
        // Fixed: the branch tested 'xpath' but then read the unrelated 'path' key.
        $selection = $crawler->filterXPath($this->plan['xpath']);
    } else {
        // Previously $selection was simply undefined here and the method
        // failed later with an unrelated error.
        throw new \InvalidArgumentException('The plan must define either a "selector" or an "xpath".');
    }

    if (!empty($this->plan['images'])) {
        $info = parse_url($this->plan['uri']);
        $base = $info['scheme'] . '://' . $info['host'] . '/';

        // The former "more than one image" guard skipped fragments holding a
        // single image; iterating directly handles 0, 1 or many.
        foreach ($selection->filterXPath('//img') as $image) {
            $imageCrawler = new Crawler($image);
            $src = $imageCrawler->attr('src');
            if (null === $src || '' === $src) {
                continue;
            }

            if (preg_match('~^https?://~i', $src)) {
                // Already absolute: use as-is (it used to be appended to the host).
                $url = $src;
            } elseif (0 === strpos($src, '//')) {
                // Protocol-relative source: reuse the scheme of the page.
                $url = $info['scheme'] . ':' . $src;
            } else {
                $url = $base . ltrim($src, '/');
            }

            copy($url, SCRYPHP_STORAGE_PATH_IMG . DIRECTORY_SEPARATOR . substr(strrchr($url, "/"), 1));
        }
    }

    $text = $selection->text();
    file_put_contents(SCRYPHP_STORAGE_PATH_TXT . DIRECTORY_SEPARATOR . time() . uniqid(time(), true) . '.txt', $text);

    return $text;
}
/**
 * Checks the element against every constraint listed in $this->attributes.
 *
 * An entry with a numeric key names an attribute that merely has to exist;
 * an entry with a non-numeric key maps an attribute name to the value it
 * has to (loosely) equal.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $element
 *
 * @return bool
 */
protected function hasAttributes(Crawler $element)
{
    foreach ($this->attributes as $key => $expected) {
        $presenceOnly = is_numeric($key);
        $actual = $element->attr($presenceOnly ? $expected : $key);

        $satisfied = $presenceOnly ? !is_null($actual) : $actual == $expected;
        if (!$satisfied) {
            return false;
        }
    }

    return true;
}
/**
 * Applies a rule set to a node and returns the extracted data.
 *
 * - A string rule is a selector: the text of the matching nodes is returned,
 *   or null when nothing matches (the literal ':attr' is passed to attr()).
 * - A callable rule is invoked with the node and its result returned.
 * - An array rule is walked key by key: '@…' keys are delegated to
 *   property() and accumulate into $return; ':first', ':next-sibling',
 *   ':parent', ':each' and ':attr' are handled specially; any other key is
 *   used as a selector and its rule parsed recursively.
 *
 * @param Crawler               $node
 * @param string|callable|array $rules
 * @param array                 $return Accumulator filled by property().
 *
 * @return mixed
 */
public function parse(Crawler $node, $rules, array &$return = [])
{
    if (is_string($rules)) {
        // Fixed: this used to be an unparenthesised nested ternary, which is a
        // fatal error in PHP 8 and associates the wrong way round in PHP 7.
        if (':attr' === $rules) {
            return $node->attr($rules);
        }

        $matched = $node->filter($rules);

        return $matched->count() ? $matched->text() : null;
    } elseif (is_callable($rules)) {
        return $rules($node);
    }

    // Iterate by value: the former by-reference loop variable served no
    // purpose and left a dangling reference behind.
    foreach ($rules as $key => $rule) {
        switch ($key) {
            // NOTE(review): this case compares $key loosely against a boolean;
            // it is kept unchanged so existing rule sets behave as before.
            case 0 === strpos($key, '@'):
                $this->property($node, $key, $rule, $return);
                break;
            case ':first':
                return $this->parse($node->first(), $rule, $return);
            case ':next-sibling':
                return $this->parse($node->siblings()->first(), $rule, $return);
            case ':parent':
                return $this->parse($node->parents()->first(), $rule, $return);
            case ':each':
                return $this->each($node, $rule);
            case ':attr':
                return $node->attr($rule);
            default:
                return $this->parse($node->filter($key), $rule, $return);
        }
    }

    return $return;
}
/**
 * Heuristically decides whether a node looks like the title of a group.
 *
 * A node qualifies when it is a heading-like tag, when its text contains a
 * run of two or more decoration characters (# * - _ = +), when its text ends
 * with a colon, or when its class attribute contains "header" or "title".
 *
 * @param Crawler $node
 *
 * @return bool
 */
protected function looksLikeGroupTitle(Crawler $node)
{
    $tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'dt'];
    if (in_array($node->nodeName(), $tags, true)) {
        return true;
    }

    // Read the text once instead of once per check.
    $text = $node->text();
    if (preg_match('/[#\\*\\-_=\\+]{2,}/', $text)) {
        return true;
    }
    if (':' === substr($text, -1)) {
        return true;
    }

    // attr() yields null when the attribute is missing; cast so that strpos()
    // never receives null (deprecated since PHP 8.1).
    $class = (string) $node->attr('class');

    return false !== strpos($class, 'header') || false !== strpos($class, 'title');
}
/**
 * Appends the URI obtained from a link node to static::$feeds.
 *
 * Nothing is added when the node has no href, when the URI fails
 * static::isValidUrl(), when it is already listed, or when static::$strict is
 * enabled and the URI fails static::isValidFeed().
 *
 * @param Crawler $node
 * @param $url Passed on to FeedLink::factory() together with the node.
 * @return void
 */
protected static function addFeed(Crawler $node, $url)
{
    if (!$node->attr('href')) {
        return;
    }

    $uri = FeedLink::factory($node, $url)->getUri();

    if (!static::isValidUrl($uri)) {
        return;
    }
    if (in_array($uri, static::$feeds)) {
        return;
    }
    if (static::$strict && !static::isValidFeed($uri)) {
        return;
    }

    static::$feeds[] = $uri;
}
/**
 * Downloads a page and collects the href attribute of every node matching
 * the '.ttle .g_statistic' selector.
 *
 * @param string $url Address of the page to download.
 *
 * @return array One entry per matching node, in document order.
 */
private function getProductsLinks(string $url) : array
{
    $page = new Crawler(file_get_contents($url));

    $hrefs = [];
    foreach ($page->filter('.ttle .g_statistic') as $element) {
        $anchor = new Crawler($element);
        $hrefs[] = $anchor->attr('href');
    }

    return $hrefs;
}
/**
 * Fetches the cities of a state (UF) from the IBGE cities site.
 *
 * Each '#lista_municipios > li' item becomes one entry: 'codigo' is the
 * item's id attribute without its first character, 'nome' is the inner HTML
 * of the item's link.
 *
 * @param $codUf State code appended to the request URL.
 * @throws Exception NOTE(review): nothing here throws explicitly; presumably raised by the HTTP client or the crawler.
 * @return array
 */
public static function getCidades($codUf = null)
{
    $client = new Client();
    $page = $client->request('GET', 'http://www.cidades.ibge.gov.br/xtras/uf.php?lang=&coduf=' . $codUf);

    $cidades = array();
    foreach ($page->filter('#lista_municipios > li') as $element) {
        $item = new Crawler($element);
        $cidades[] = [
            'codigo' => substr($item->attr('id'), 1),
            'nome' => $item->filter('a')->html(),
        ];
    }

    return $cidades;
}
/**
 * Transform a bid HTML node into a bid array.
 *
 * Resulting keys: title, price, url, bid_id, is_pro, created_at, picture,
 * placement. 'price' and 'picture' are null when the matching element is
 * absent; 'bid_id' is null when the URL does not match the expected pattern.
 *
 * @param Crawler $node HTML bid node
 * @return array bid
 */
protected function processBid(Crawler $node)
{
    $bid = array();
    $bid['title'] = $node->attr('title');

    $price = $node->filter('.price');
    $bid['price'] = $price->count() ? trim($price->text()) : null;

    $bid['url'] = $node->attr('href');

    // Guard against URLs that do not match: reading $matches['id']
    // unconditionally raised an "undefined index" notice.
    $matches = array();
    preg_match('@/\\w+/(?P<id>\\d+)\\.htm\\?ca=\\d+_s@', (string) $bid['url'], $matches);
    $bid['bid_id'] = isset($matches['id']) ? $matches['id'] : null;

    $category = trim($node->filter('.category')->text());
    $bid['is_pro'] = false !== strpos($category, '(pro)');

    $dateParts = $node->filter('.date > div')->each(function ($part) {
        return $part->text();
    });
    // Pad so that the destructuring never reads a missing offset.
    list($date, $time) = array_pad($dateParts, 2, null);
    $bid['created_at'] = new LeboncoinDatetime($date, $time);

    $bid['picture'] = $node->filter('.image')->children()->count()
        ? $node->filter('.image-and-nb > img')->attr('src')
        : null;

    // Normalise "a/b" style placements to "a / b".
    $placement = explode('/', trim($node->filter('.placement')->text()));
    $bid['placement'] = implode(' / ', array_map('trim', $placement));

    return $bid;
}
/**
 * Rebuilds the 'stores' table from the Google Play category pages.
 *
 * Truncates the table, loads the Play Store apps page, turns every
 * 'body a.child-submenu-link' link into a "top selling free" collection URL
 * and inserts one row (status 'not') per package returned by
 * googlePackageListFromPage() for each of those URLs.
 *
 * @return void
 */
public function createPackages()
{
    DB::table('stores')->truncate();

    $html = $this->crawlerLink('https://play.google.com/store/apps?hl=en&gl=us');
    $page = new Crawler($html);

    $collectionUrls = [];
    foreach ($page->filter('body a.child-submenu-link') as $index => $element) {
        $anchor = new Crawler($element);
        $collectionUrls[$index] = 'https://play.google.com' . $anchor->attr('href') . '/collection/topselling_free?hl=en&gl=us';
    }

    foreach ($collectionUrls as $collectionUrl) {
        foreach ($this->googlePackageListFromPage($collectionUrl) as $packageName) {
            DB::table('stores')->insert([
                'name' => $packageName,
                'status' => 'not'
            ]);
        }
    }
}
/**
 * Stores the node and the value of its href attribute.
 *
 * @param Crawler $node
 */
public function __construct(Crawler $node)
{
    $this->node = $node;

    $href = $node->attr('href');
    $this->url = $href;
}
/**
 * Issues a POST request through the given document.
 *
 * The target is the href attribute of $this->crawler and the third request
 * argument is the result of render().
 *
 * @param \Bpi\Sdk\Document $document
 */
public function post(Document $document)
{
    $target = $this->crawler->attr('href');
    $payload = $this->render();

    $document->request('POST', $target, $payload);
}
/**
 * Asserts that the given attribute of the node compares equal to an empty string.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node
 * @param string $name Attribute name handed to the node's attr() method.
 */
public function dontSeeNodeAttribute($node, $name)
{
    $actual = $node->attr($name);

    $this->assertEquals('', $actual);
}
/**
 * Scrapes the kiveeshop.com category listings and persists what it finds.
 *
 * For each of the four category URLs, up to five passes are made. Every
 * '.ajax_block_product a' link on the listing is followed, and the product
 * page's name, price, sale price, description and slider images are stored
 * as a Product plus one ProductPhoto per image.
 *
 * NOTE(review): none of the category URLs contains the '{{page}}' placeholder,
 * so the page number never changes the requested URL — confirm intent.
 * NOTE(review): a pass loop stops as soon as NO '.pagination_next.disabled'
 * element is found, which looks inverted — confirm intent.
 * NOTE(review): the collected $products list is never returned or used.
 *
 * @return void
 */
public function getKivee()
{
    set_time_limit(10000);

    $base_url = 'http://www.kiveeshop.com/';
    $client = new Client();
    $products = [];
    $placeholder = '{{page}}';
    $categoryUrls = [
        $base_url . '4-tops',
        $base_url . '14-outwears',
        $base_url . '15-dresses',
        $base_url . '13-bottoms',
    ];

    foreach ($categoryUrls as $categoryUrl) {
        foreach (range(1, 5) as $page) {
            $listingUrl = str_replace($placeholder, $page, $categoryUrl);

            $headers = get_headers($listingUrl, 1);
            if ($headers[0] == 'HTTP/1.0 404 Not Found') {
                break;
            }

            $listing = $client->request('GET', $listingUrl);

            foreach ($listing->filter('.ajax_block_product a') as $linkElement) {
                $link = new Crawler($linkElement);
                $productUrl = $link->attr('href');
                $productPage = $client->request('GET', $productUrl);

                $name = $productPage->filter('#ContentPlaceHolderBody_ContentPlaceHolderBreadCrumb_lblNamaProduk')->text();
                $price = $productPage->filter('#lblHargaAwal')->text();
                $sale_price = $productPage->filter('#lblHarga')->text();
                $desc = $productPage->filter('#cssmenu ul li div')->text();

                $images = [];
                foreach ($productPage->filter('.slider-relative img') as $imageElement) {
                    $images[] = $imageElement->getAttribute('src');
                }

                $product = new Product();
                $product->title = $name;
                $product->price = filter_var($price, FILTER_SANITIZE_NUMBER_INT);
                $product->description = preg_replace('/\\s+/', ' ', $desc);
                $product->sale_price = filter_var($sale_price, FILTER_SANITIZE_NUMBER_INT);
                $product->url = $productUrl;
                $product->save();

                foreach ($images as $image) {
                    // Image paths starting with './' are made absolute.
                    $absoluteImage = str_replace('./', $base_url, $image);

                    $photo = new ProductPhoto();
                    $photo->photo_url = $absoluteImage;
                    $photo->thumbnail_url = $absoluteImage;
                    $photo->product_id = $product->id;
                    $photo->save();
                }

                $products[] = ['name' => $name, 'price' => $price, 'images' => $images];
            }

            $disabledNext = $listing->filter('.pagination_next.disabled');
            if (count($disabledNext) < 1) {
                break;
            }
        }
    }
}
/**
 * Initialises the parent with the crawler's first node and records the
 * node's type attribute, trimmed and lower-cased.
 *
 * @param Crawler $node
 * @param string  $currentUri
 */
public function __construct(Crawler $node, $currentUri)
{
    parent::__construct($node->getNode(0), $currentUri);

    $rawType = $node->attr('type');
    $this->type = strtolower(trim($rawType));
}
/**
 * Extracts return information from a DOMNode.
 *
 * The type is the node's 'type' attribute; the description is whatever
 * getInner() produces for the node.
 *
 * @param \DOMNode $node The DOMNode to parse
 *
 * @return array First element is the type, second the description
 */
protected function getReturn(\DOMNode $node)
{
    $wrapper = new Crawler($node);

    return array($wrapper->attr('type'), $this->getInner($node));
}
/**
 * Derives the card type from the node's itemtype attribute.
 *
 * The value after the last '/' of the itemtype is returned; 'Thing' is used
 * when the attribute is missing or empty.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node
 * @return string
 */
public function getCardTypeFromCrawler(\Symfony\Component\DomCrawler\Crawler $node)
{
    $itemType = $node->attr('itemtype') ?: 'Thing';
    $segments = explode('/', $itemType);

    return end($segments);
}
/**
 * Returns the absolute URL of the form's action.
 *
 * The raw action attribute is handed to getAbsoluteUrlFor().
 *
 * @param \Symfony\Component\DomCrawler\Crawler $form
 * @return string
 * @throws \Codeception\Exception\TestRuntimeException if either the current
 *         URL or the URI of the form's action can't be parsed
 */
protected function getFormUrl(Crawler $form)
{
    return $this->getAbsoluteUrlFor($form->attr('action'));
}
/**
 * Issues an HTTP PUT through the given document, targeting the href
 * attribute of $this->crawler.
 *
 * @param \Bpi\Sdk\Document $document
 */
public function put(Document $document)
{
    $target = $this->crawler->attr('href');

    $document->request('PUT', $target);
}
/**
 * Builds a Forum topic (with its Profile) from one forum message element.
 *
 * @param mixed $item Passed straight to the Crawler constructor.
 *
 * @return Forum
 */
private static function parseTopicDetails($item)
{
    $message = new Crawler($item);

    $topic = new Forum();
    $topic->profile = new Profile();

    $topic->setTime($message->filter('div[style="padding-left: 3px;"]')->text());

    // Message id, e.g. <div class="forum_border_around" id="forumMsg30902219">...</div>
    $topic->setid(str_replace('forumMsg', '', $message->attr('id')));

    // Avatar url, e.g. <img src="http://cdn.myanimelist.net/images/useravatars/1901304.jpg" vspace="2" border="0">
    // Note: some MAL users do not have any avatar in the forum.
    try {
        $topic->profile->setAvatarUrl($message->filter('img')->attr('src'));
    } catch (\InvalidArgumentException $e) {
        // No avatar present: leave it unset.
    }

    $details = explode("\n\t\t ", $message->filter('td[class="forum_boardrow2"]')->text());

    $topic->setUsername($details[0]);
    $topic->profile->details->setForumPosts(str_replace('Posts: ', '', $details[6]));
    $topic->profile->details->setAccessRank($details[1] == '' ? 'Member' : $details[1]);

    // When offset 6 yielded no post count, the remaining fields are read one
    // position earlier.
    if ($topic->profile->details->getForumPosts() == '') {
        $topic->profile->details->setStatus($details[3]);
        $topic->profile->details->setJoinDate(str_replace('Joined: ', '', $details[4]));
        $topic->profile->details->setForumPosts(str_replace('Posts: ', '', $details[5]));
    } else {
        $topic->profile->details->setStatus($details[4]);
        $topic->profile->details->setJoinDate(str_replace('Joined: ', '', $details[5]));
    }

    // Force JSON arrays rather than objects.
    $topic->profile->manga_stats = null;
    $topic->profile->anime_stats = null;

    // Comment body, e.g. <div id="message25496275">...</div>
    $commentSelector = 'div[id="message' . $topic->getId() . '"]';
    $topic->setComment($message->filter($commentSelector)->html());

    return $topic;
}
/**
 * Walks a dump of <table> elements containing <tag> elements and creates one
 * tag class per <tag> through createTagClass(), then calls generateTypes().
 *
 * For every tag the properties combine the table-level attributes (name, g0,
 * g1, g2), the tag's own attributes (id, name, type, writable, count, index,
 * flags, local group overrides), its English description and, when present,
 * its English-labelled value list. Every type seen is recorded in
 * $this->types.
 *
 * @param string $dump Markup added to a Crawler via addContent().
 *
 * @return void
 */
protected function extractDump($dump)
{
    $document = new Crawler();
    $document->addContent($dump);

    // Flags are checked in this order, which also fixes the order of the
    // resulting 'flag_*' entries.
    $knownFlags = array('Avoid', 'Binary', 'Permanent', 'Protected', 'Unsafe', 'List', 'Mandatory', 'Bag', 'Seq', 'Alt');

    foreach ($document->filter('table') as $tableNode) {
        $table = new Crawler();
        $table->addNode($tableNode);

        $tableGroup = $table->attr('g1');
        $tableName = $table->attr('name');
        $tableG0 = $table->attr('g0');
        $tableG2 = $table->attr('g2');

        foreach ($table->filter('tag') as $tagNode) {
            $tag = new Crawler();
            $tag->addNode($tagNode);

            $extra = array();

            $localG0 = $tag->attr('g0');
            if ($localG0) {
                $extra['local_g0'] = $localG0;
            }

            // A tag-level g1 overrides the table's group unless it is one of
            // the two excluded names.
            $localG1 = $tag->attr('g1');
            if ($localG1 && !in_array($localG1, array('MakerNotes', 'Chapter#'))) {
                $groupName = $localG1;
                $extra['local_g1'] = $localG1;
            } else {
                $groupName = $tableGroup;
            }

            $localG2 = $tag->attr('g2');
            if ($localG2) {
                $extra['local_g2'] = $localG2;
            }

            $flags = explode(',', $tag->attr('flags'));
            foreach ($knownFlags as $flag) {
                if (in_array($flag, $flags)) {
                    $extra['flag_' . $flag] = 'true';
                }
            }

            $subspace = str_replace('::', '\\', $groupName);
            $tagName = $tag->attr('name');
            $classname = self::generateClassname($tagName);
            $type = $tag->attr('type');

            $base = array(
                'Id' => $tag->attr('id'),
                'Name' => $tagName,
                'FullName' => $tableName,
                'GroupName' => $groupName,
                'g0' => $tableG0,
                'g1' => $tableGroup,
                'g2' => $tableG2,
                'Type' => $type,
                'Writable' => $tag->attr('writable'),
                'Description' => $tag->filter('desc[lang="en"]')->first()->text(),
            );
            $properties = array_merge($base, $extra);

            $count = $tag->attr('count');
            if ($count) {
                $properties['MaxLength'] = $count;
            }

            $this->types[$type] = $type;

            $index = $tag->attr('index');
            if ($index) {
                $properties['Index'] = $index;
            }

            $valueLists = $tag->filter('values');
            if (count($valueLists) > 0) {
                $values = array();
                foreach ($valueLists->first()->filter('key') as $keyNode) {
                    $key = new Crawler();
                    $key->addNode($keyNode);

                    $id = $key->attr('id');
                    $label = $key->filter('val[lang="en"]')->first()->text();
                    $values[$id] = array('Id' => $id, 'Label' => $label);
                }
                $properties['Values'] = $values;
            }

            $this->createTagClass($subspace, $classname, $properties);
        }
    }

    $this->generateTypes();
}
/**
 * Describes a category link as an array holding its text ('name') and its
 * href prefixed with $this->baseUrl ('href').
 *
 * @param Crawler $crawler
 *
 * @return array
 */
protected function getCategoryLink(Crawler $crawler)
{
    $label = $crawler->text();
    $target = $this->baseUrl . $crawler->attr('href');

    return array('name' => $label, 'href' => $target);
}
/**
 * Scrapes the first eBay Malaysia "Consoles" result page and stores every
 * listed item as a Products row.
 *
 * Title, URL, price, image and shipping text are collected per list position
 * and saved together; the brand is looked up by matching the words of the
 * title against the 'brand' table, falling back to brand id 204.
 *
 * @return void
 */
public function indexNewGames()
{
    // Kept for backward compatibility: the scraped rows have always been
    // published through the global $products variable.
    global $products;
    $products = array();

    $category = null;
    $condition = null;

    for ($i = 1; $i < 2; $i++) {
        $url = 'http://www.ebay.com.my/sch/Consoles-/139971/i.html?LH_ItemCondition=1000|4000|5000|6000&_pgn=' . $i . '&_skc=200&rt=nc';
        $html = file_get_contents($url);

        // One crawler per page, so pages never accumulate in a shared document.
        $crawler = new Crawler();
        $crawler->addContent($html);

        $category = $crawler->filter('span.kwcat b')->text();
        $condition = $crawler->filter('span.cbx')->text();

        // The old code assigned each()'s return value to a global $rank and
        // then ran ++$rank on that array; both were dropped as they had no
        // purpose and the increment is invalid on an array.
        $crawler->filter('ul#ListViewInner')->each(function ($list) use (&$products) {
            $list->filter('h3.lvtitle a')->each(function ($link, $i) use (&$products) {
                $products[$i]['title'] = $link->text();
                $products[$i]['url'] = $link->attr('href');
            });

            $list->filter('ul.lvprices.left.space-zero')->each(function ($prices, $i) use (&$products) {
                // Strip the currency prefix and thousands separators.
                $products[$i]['price'] = str_replace(array('RM', ','), array('', ''), $prices->filter('li.lvprice.prc')->last()->text());
            });

            $list->filter('a.img.imgWr2 img')->each(function ($image, $i) use (&$products) {
                $products[$i]['image'] = $image->attr('src');
            });

            $list->filter('span.ship')->each(function ($shipping, $i) use (&$products) {
                $products[$i]['shipping'] = $shipping->text();
            });
        });
    }

    foreach ($products as $pro) {
        // The selectors are matched independently, so a position may lack
        // some fields; default them to null instead of reading missing keys.
        $title = isset($pro['title']) ? $pro['title'] : null;

        $product = new Products();
        if ($category == 'Consoles') {
            $product->category_id = 6;
        }
        if ($condition == 'Brand New(selected)') {
            $product->condition_id = 1;
        }

        $brands = \DB::table('brand')->whereIn('brand_title', explode(' ', (string) $title))->get();
        // count() works for a plain array and for a collection object; a bare
        // truthiness test never reaches the fallback when get() returns an object.
        if (!empty($brands) && count($brands) > 0) {
            foreach ($brands as $brand) {
                $product->brand_id = $brand->id;
            }
        } else {
            $product->brand_id = 204;
        }

        $product->product_name = $title;
        $product->shopper_link = isset($pro['url']) ? $pro['url'] : null;
        $product->product_price = isset($pro['price']) ? $pro['price'] : null;
        $product->picture_link = isset($pro['image']) ? $pro['image'] : null;
        $product->product_shipping = isset($pro['shipping']) ? $pro['shipping'] : null;
        $product->save();
    }
}
/**
 * Collects the href attribute of every node matching $this->profileLinkSelector.
 *
 * @param Crawler $crawler
 *
 * @return array
 */
public function getPageLinkCollection(Crawler $crawler)
{
    $extractHref = function (Crawler $link) {
        return $link->attr('href');
    };

    return $crawler->filter($this->profileLinkSelector)->each($extractHref);
}
/**
 * Resolves the form's action against the URL of the current page.
 *
 * An empty action or '#' resolves to the current URL. Otherwise each parsed
 * component of the action overrides the same component of the current URL,
 * except that a relative path is appended to the directory of the current
 * path.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $form
 *
 * @return string
 *
 * @throws TestRuntime When the current URL or the action cannot be parsed.
 */
protected function getFormUrl($form)
{
    $action = $form->attr('action');
    $currentUrl = $this->client->getHistory()->current()->getUri();

    if (empty($action) || $action === '#') {
        return $currentUrl;
    }

    $build = parse_url($currentUrl);
    if ($build === false) {
        throw new TestRuntime("URL '{$currentUrl}' is malformed");
    }

    $uriParts = parse_url($action);
    if ($uriParts === false) {
        throw new TestRuntime("URI '{$action}' is malformed");
    }

    foreach ($uriParts as $part => $value) {
        $isRelativePath = $part === 'path' && strpos($value, '/') !== 0 && !empty($build[$part]);
        if (!$isRelativePath) {
            $build[$part] = $value;
            continue;
        }

        // If the current path ends with a slash, relative paths are below it.
        if (preg_match('~/$~', $build[$part])) {
            $build[$part] = $build[$part] . $value;
            continue;
        }

        // Fixed: dirname('/page') is '/', so appending '/' . $value produced
        // '//value'. Strip the trailing separator before joining.
        $build[$part] = rtrim(dirname($build[$part]), '/\\') . '/' . $value;
    }

    return \GuzzleHttp\Url::buildUrl($build);
}
/**
 * Process an image node.
 *
 * Produces a plain object carrying the node's name ('nodeName') and the
 * value of its src attribute ('src').
 *
 * @since 0.9.0
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node
 *
 * @return \stdClass
 *
 * @author nguyenvanduocit
 */
public function process_Img(Crawler &$node)
{
    return (object) array(
        'nodeName' => $node->nodeName(),
        'src' => $node->attr('src'),
    );
}
/**
 * Reads an attribute from the element and returns it trimmed; a missing
 * attribute (null) yields an empty string.
 *
 * @param Crawler $element
 * @param $attributeName
 * @return string
 */
protected function getAttributeValue(Crawler $element, $attributeName)
{
    $raw = $element->attr($attributeName);

    if (null === $raw) {
        return "";
    }

    return trim($raw);
}
/**
 * Returns the form's action, or the URI of the current history entry when
 * the action is empty or '#'.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $form
 *
 * @return string
 */
protected function getFormUrl($form)
{
    $action = $form->attr('action');

    if ($action && $action != '#') {
        return $action;
    }

    return $this->client->getHistory()->current()->getUri();
}
/**
 * Works out the URL a form submits to.
 *
 * An empty action or '#' stands for the current URL. A target that neither
 * starts with '/' nor with http(s):// is treated as relative and prefixed
 * with the dirname of the current URL.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $form
 *
 * @return string
 */
protected function getFormUrl($form)
{
    $action = $form->attr('action');
    $currentUrl = $this->client->getHistory()->current()->getUri();

    // Empty url: fall back to the page we are on.
    $target = ($action && $action != '#') ? $action : $currentUrl;

    $isRootRelative = strpos($target, '/') === 0;
    if ($isRootRelative || preg_match('~^https?://~', $target)) {
        return $target;
    }

    // Relative url: anchor it to the directory of the current URL.
    $location = pathinfo($currentUrl);

    return $location['dirname'] . '/' . $target;
}
/**
 * Returns the value of the first checked radio button in the group.
 *
 * The group is accepted only when the crawler's node name is 'input' and
 * its type attribute is 'radio'.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $radioGroup
 * @return string|null The value attribute of the checked radio, or null when none is checked.
 *
 * @throws \Exception When the given element is not a radio button.
 */
protected function getCheckedValueFromRadioGroup(Crawler $radioGroup)
{
    $isRadio = $radioGroup->nodeName() === 'input' && $radioGroup->attr('type') === 'radio';
    if (!$isRadio) {
        throw new Exception('Given element is not a radio button.');
    }

    foreach ($radioGroup as $radio) {
        if (!$radio->hasAttribute('checked')) {
            continue;
        }

        return $radio->getAttribute('value');
    }

    return null;
}
/**
 * Collects the href attribute of every node matching
 * '.b-poster-detail .b-poster-detail__link'.
 *
 * @param Crawler $crawler
 *
 * @return array
 */
private function getPageLinkCollection(Crawler $crawler)
{
    $links = $crawler->filter('.b-poster-detail .b-poster-detail__link');

    $extractHref = function (Crawler $link) {
        return $link->attr('href');
    };

    return $links->each($extractHref);
}