/**
 * Parse the history page of a single anime or manga title.
 *
 * The text of every `div.spaceit_pad` row is split on single spaces: the word
 * at index 1 is used as the episode/chapter number, the words at indexes 4
 * and 6 are joined with a space and handed to Date::formatTime().
 *
 * @param string $contents Raw HTML of the page.
 * @param mixed  $id       Identifier of the title; cast to int.
 * @param string $type     'anime' produces Anime items, any other value produces Manga items.
 *
 * @return array One entry per row with the keys 'item', 'type' and 'time_updated'.
 */
public static function parse($contents, $id, $type)
{
    $page = new Crawler();
    $page->addHTMLContent($contents, 'UTF-8');

    $rows = $page->filter('div[class="spaceit_pad"]');
    // NOTE(review): the pattern has a single capture group, so '$2' presumably
    // expands to an empty string and the " <word> Details" suffix is dropped.
    $title = preg_replace('/ (\\w+?) Details/', '$2', $page->filter('div[class="normal_header"]')->text());

    $isAnime = $type === 'anime';
    $history = array();

    foreach ($rows as $row) {
        $rowCrawler = new Crawler($row);
        $words = explode(' ', $rowCrawler->text());

        $item = $isAnime ? new Anime() : new Manga();
        $item->setId((int) $id);
        $item->setTitle($title);

        $progress = (int) $words[1];
        if ($isAnime) {
            $item->setWatchedEpisodes($progress);
        } else {
            $item->setChaptersRead($progress);
        }

        $history[] = array(
            'item' => $item,
            'type' => $type,
            'time_updated' => Date::formatTime($words[4] . ' ' . $words[6]),
        );
    }

    return $history;
}
/**
 * Build Anime objects from a list of schedule entry nodes.
 *
 * @param iterable $rows DOM nodes, each one wrapped in a Crawler and parsed independently.
 *
 * @return Anime[] One Anime per input node, in input order.
 */
public static function parseDay($rows)
{
    $result = array();

    foreach ($rows as $item) {
        $crawler = new Crawler($item);
        $anime = new Anime();

        $titleLink = $crawler->filter('a[class="link-title"]');
        $url = (string) $titleLink->attr('href');

        // Only read the captured id when the pattern really matched. The previous
        // condition ($id !== false || $id !== 0) was always true, so a
        // non-matching URL read an undefined $urlParts[2].
        if (preg_match('/\\/(anime|manga)\\/(\\d+)\\/.*?/', $url, $urlParts) === 1) {
            $anime->setId((int) $urlParts[2]);
        }

        $anime->setTitle(trim($titleLink->text()));

        $producer = $crawler->filter('span[class="producer"] a');
        if ($producer->count() > 0) {
            $anime->setProducers(explode(', ', $producer->text()));
        }

        $anime->setEpisodes((int) str_replace(' eps', '', $crawler->filter('div[class="eps"] span')->text()));

        $genreArray = array();
        foreach ($crawler->filter('div[class="genres-inner js-genre-inner"] a') as $genre) {
            $genreCrawler = new Crawler($genre);
            $genreArray[] = $genreCrawler->text();
        }
        $anime->setGenres($genreArray);

        $anime->setImageUrl($crawler->filter('div[class="image lazyload"]')->attr('data-bg'));
        $anime->setSynopsis(trim($crawler->filter('div[class="synopsis js-synopsis"]')->text()));

        // The info line is split on '-'; the first part is used as the media type.
        $detail = explode('-', $crawler->filter('div[class="info"]')->text());
        $anime->setType(trim($detail[0]));

        $anime->setMembersCount((int) str_replace(',', '', trim($crawler->filter('span[class="member fl-r"]')->text())));
        $anime->setMembersScore((float) trim($crawler->filter('span[class="score"]')->text()));

        $result[] = $anime;
    }

    return $result;
}
/**
 * Extract the teacher's name from the node text.
 *
 * @param Crawler $crawler Node whose text holds the name.
 * @return string The text without the 'Vyučující: ' label and the '(email)' marker, trimmed.
 */
private function parseTeachersName(Crawler $crawler)
{
    // Array needles are applied one after another, in the order given.
    $withoutLabels = str_replace(array('Vyučující: ', '(email)'), '', $crawler->text());

    return trim($withoutLabels);
}
/**
 * Read the trimmed text of a crawler node, falling back to a default.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node    Node to read the text from.
 * @param mixed                                 $default Value returned when the trimmed text is empty.
 *
 * @return mixed The trimmed text, or $default when nothing is left after trimming.
 */
protected function parseTextFromNode($node, $default = null)
{
    $trimmed = trim($node->text());

    return $trimmed === '' ? $default : $trimmed;
}
/**
 * Normalises the price value of a dish.
 *
 * @param Crawler|null $element Node holding the price text; null means "no price".
 * @return float 0.0 when no element is given, otherwise the node text with
 *               commas turned into dots and cast to float.
 */
public function normalisePrice(Crawler $element = null)
{
    if (null === $element) {
        return 0.0;
    }

    $withDecimalPoint = str_replace(',', '.', (string) $element->text());

    return (float) $withDecimalPoint;
}
/**
 * Collect the profile fields from a page's HTML.
 *
 * Breadcrumb and description lists are padded with empty strings up to three
 * entries, so indexes 0..2 always exist.
 *
 * @param string $html Markup of the page.
 *
 * @return array Keys: city, category, title, salary, experience_year,
 *               experience_description, achievement, expect, skills (JSON string).
 */
private function getPageData($html)
{
    $trimmedText = function (Crawler $node) {
        return trim($node->text());
    };

    $page = new Crawler($html);
    $body = $page->filter('body');
    $header = $body->filter('.page-header');

    $blanks = array_fill(0, 3, '');
    // Array union keeps the scraped entries and only fills the missing indexes.
    $breadcrumbs = $header->filter('.breadcrumb li span a')->each($trimmedText) + $blanks;
    $descriptions = $body->filter('.container > .row .profile')->each($trimmedText) + $blanks;

    $title = trim($header->filter('h1')->text());
    // The first byte of the salary text is cut off.
    $salary = substr(trim($header->filter('.profile-details-salary')->text()), 1);
    $experienceYears = (int) $body->filter('.before-hint')->text();
    $skills = json_encode($this->getSkills($body));

    return [
        'city' => $breadcrumbs[1],
        'category' => $breadcrumbs[2],
        'title' => $title,
        'salary' => $salary,
        'experience_year' => $experienceYears,
        'experience_description' => $descriptions[0],
        'achievement' => $descriptions[1],
        'expect' => $descriptions[2],
        'skills' => $skills,
    ];
}
/**
 * Read the grid as a two-dimensional array of cell texts.
 *
 * @param Crawler $crawler Document containing the `table.ob_gBody` grid.
 *
 * @return array One entry per `tr`; each entry lists the texts of that row's `.ob_gCc2` cells.
 */
protected function extract(Crawler $crawler)
{
    $cellTexts = function (Crawler $row) {
        return $row->filter('.ob_gCc2')->each(function (Crawler $cell) {
            return $cell->text();
        });
    };

    return $crawler->filter('table.ob_gBody tr')->each($cellTexts);
}
/**
 * Create a Person from the text of a crawler node.
 *
 * After empty lines are removed, line 1 supplies the name (the part before
 * the first comma), line 2 the Czech-formatted birthday and line 3 is passed
 * through as the third constructor argument.
 *
 * @param Crawler $crawler
 * @return Person
 */
public static function parseFromDomCrawler(Crawler $crawler)
{
    $text = StringHelper::removeEmptyLines($crawler->text());
    $lines = array_map('trim', explode("\n", $text));

    $nameParts = explode(',', $lines[1]);
    $name = trim($nameParts[0]);
    $birthday = DateTimeParser::parseFromCzechDateString($lines[2]);

    return new Person($name, $birthday, $lines[3]);
}
/**
 * Collect the tag groups listed in the item-info table.
 *
 * @param Crawler $crawler Page to read from.
 *
 * @return array One entry per table row: ['name' => trimmed text of the first cell,
 *               'list' => texts of the row's `.tag span` nodes].
 */
private function getPageTags(Crawler $crawler)
{
    $rows = $crawler->filter('.item-info table tr');

    return $rows->each(function (Crawler $row) {
        $name = trim($row->filter('td')->first()->text());
        $list = $row->filter('.tag span')->each(function (Crawler $tag) {
            return $tag->text();
        });

        return ['name' => $name, 'list' => $list];
    });
}
/**
 * Heuristically decide whether a node looks like the title of a group.
 *
 * A node qualifies when any of these holds:
 *  - it is a heading, `strong` or `dt` element;
 *  - its text contains a run of two or more of the characters # * - _ = +;
 *  - its text ends with a colon;
 *  - its class attribute contains "header" or "title".
 *
 * @param Crawler $node Node to inspect.
 *
 * @return bool
 */
protected function looksLikeGroupTitle(Crawler $node)
{
    $tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'dt'];
    if (in_array($node->nodeName(), $tags, true)) {
        return true;
    }

    // Read the text once instead of once per check.
    $text = $node->text();
    if (preg_match('/[#\\*\\-_=\\+]{2,}/', $text)) {
        return true;
    }
    if (':' === substr($text, -1)) {
        return true;
    }

    // attr() yields null when the attribute is missing; cast it so strpos()
    // never receives null (deprecated since PHP 8.1).
    $class = (string) $node->attr('class');

    return false !== strpos($class, 'header') || false !== strpos($class, 'title');
}
/**
 * Fill $this->leagues from the league tables of a page.
 *
 * Headings ending in " leagues" give the league types (lower-cased, suffix
 * removed). The n-th league table is paired with the n-th collected type,
 * and every body row becomes one UserLeague.
 *
 * @param Crawler $crawler Page to read from.
 *
 * @return void
 */
public function populateLeagues($crawler)
{
    $types = [];
    $headings = $crawler->filterXPath('//*[@class="ismSecondary"]/*[@class="ismTableHeading"]');
    foreach ($headings as $headingNode) {
        $headingText = (new Crawler($headingNode))->text();
        if (!preg_match('~ leagues$~', $headingText)) {
            continue;
        }
        $types[] = strtolower(preg_replace('~ leagues$~', '', $headingText));
    }

    $tables = $crawler->filterXPath('//*[@class="ismTable ismLeagueTable"]');
    foreach ($tables as $index => $tableNode) {
        $rows = (new Crawler($tableNode))->filterXPath('//tbody/tr');
        foreach ($rows as $rowNode) {
            $league = new UserLeague();
            $league->type = $types[$index];
            $league->populate(new Crawler($rowNode));
            $this->leagues[] = $league;
        }
    }
}
/**
 * The widget output contains the flash message body, and its h4 contains the title.
 */
public function testMessageRender()
{
    $flash = new FlashMessage();
    $flash->title = 'Error!';
    $flash->type = FlashMessage::TYPE_ERROR;
    $flash->message = 'This is a test error';
    Yii::$app->session->setFlash('test', $flash);

    $rendered = FlashMessages::widget(['id' => 'flash-test']);
    $dom = new Crawler($rendered);

    $heading = $dom->filter('h4');
    $this->assertContains('This is a test error', $dom->text());
    $this->assertContains('Error!', $heading->text());
}
/**
 * Add a meal to the cart by posting to /cart, then assert a 200 response.
 *
 * The meal is located through the single `h4` whose text contains $mealName;
 * its id comes from the `data-meal` attribute of an input below the first
 * node returned by parents() for that heading.
 *
 * @param string  $mealName Text that exactly one h4 must contain.
 * @param mixed   $quantity Sent as the 'quantity' field.
 * @param Crawler $crawler  Page listing the meals.
 * @param Client  $client   Client used for the POST request.
 *
 * @throws \RuntimeException When the number of matching titles is not exactly one.
 */
private function addToCart($mealName, $quantity, Crawler $crawler, Client $client)
{
    $containsMealName = function ($heading) use ($mealName) {
        return false !== strpos($heading->text(), $mealName);
    };

    $titles = $crawler->filter('h4')->reduce($containsMealName);
    $found = count($titles);
    if (1 !== $found) {
        throw new \RuntimeException(sprintf('Expected 1 title containing "%s", found %s.', $mealName, $found));
    }

    $mealInput = $titles->eq(0)->parents()->first()->filter('input[data-meal]');
    $payload = array(
        'meal' => $mealInput->attr('data-meal'),
        'mode' => 'add',
        'quantity' => $quantity,
    );

    $client->request('POST', '/cart', $payload);
    $this->assertEquals(200, $client->getResponse()->getStatusCode());
}
/**
 * Apply an ":each" rule to a node and collect the results.
 *
 * A string rule is a selector and the texts of all matching nodes are
 * returned, e.g.
 *   '@idioms' => [':each' => '#relatedentries > dl > dd > ul > li > a > .arl8']
 *
 * An array rule maps selectors to nested patterns; each matching node is
 * handed to $this->parse() together with the pattern and the result slot for
 * that match index.
 *
 * @param Crawler      $node Node the rule is evaluated against.
 * @param string|array $rule Selector, or map of selector => pattern.
 *
 * @return array Collected values.
 */
private function each(Crawler $node, $rule)
{
    $elements = [];

    if (is_string($rule)) {
        // Capture by reference: with a by-value capture the closure filled a
        // private copy and this branch always returned an empty array.
        $node->filter($rule)->each(function (Crawler $node) use (&$elements) {
            $value = $node->text();
            if (null !== $value) {
                $elements[] = $value;
            }
        });

        return $elements;
    }

    foreach ($rule as $filter => $pattern) {
        $node->filter($filter)->each(function (Crawler $node, $i) use (&$elements, $pattern) {
            $elements[$i] = [];
            $this->parse($node, $pattern, $elements[$i]);
        });
    }

    return $elements;
}
/**
 * No Route annotation is defined for this controller method, so the page is
 * not directly accessible.
 *
 * The method can be called from a template file:
 * {{ render(controller('MmiBlogBundle:Feed:last', {limit: 5})) }}
 *
 * @param int $limit Reading stops once this many items have been collected.
 *
 * @return mixed Whatever the templating service's renderResponse() returns.
 */
public function lastAction($limit)
{
    // Import the RSS feed; on failure continue with an empty list.
    try {
        $reader = Reader::import('http://etin.yourphototravel.com/fr/etins.rss');
    } catch (\Exception $e) {
        $reader = array();
    }

    // Read the feed and prepare the parameters sent to the view.
    $items = array();
    $count = 0;
    foreach ($reader as $item) {
        $crawler = new Crawler($item->getDescription());
        $images = $crawler->filter('img');

        $items[] = array(
            'title' => $item->getTitle(),
            'link' => $item->getLink(),
            // attr() and text() throw on an empty node list; an item without
            // an image or with an empty description must not break the list.
            'image' => $images->count() > 0 ? $images->first()->attr('src') : null,
            'description' => $crawler->count() > 0 ? utf8_decode($crawler->text()) : '',
        );

        $count++;
        if ($count == $limit) {
            break;
        }
    }

    // Build the response.
    return $this->get('templating')->renderResponse('MmiBlogBundle:Feed:list.html.twig', array('items' => $items));
}
/**
 * Convert the HTML node of a bid into an associative array.
 *
 * @param Crawler $node HTML bid node
 * @return array Keys, in order: title, price, url, bid_id, is_pro,
 *               created_at, picture, placement.
 */
protected function processBid(Crawler $node)
{
    $bid = array('title' => $node->attr('title'));

    $bid['price'] = null;
    if ($node->filter('.price')->count()) {
        $bid['price'] = trim($node->filter('.price')->text());
    }

    $bid['url'] = $node->attr('href');

    // The numeric id is embedded in the URL.
    preg_match('@/\\w+/(?P<id>\\d+)\\.htm\\?ca=\\d+_s@', $bid['url'], $matches);
    $bid['bid_id'] = $matches['id'];

    $category = trim($node->filter('.category')->text());
    $bid['is_pro'] = false !== strstr($category, '(pro)');

    // The first two `.date > div` texts are the date and the time.
    $dateParts = $node->filter('.date > div')->each(function ($part, $i) {
        return $part->text();
    });
    list($date, $time) = $dateParts;
    $bid['created_at'] = new LeboncoinDatetime($date, $time);

    if ($node->filter('.image')->children()->count()) {
        $bid['picture'] = $node->filter('.image-and-nb > img')->attr('src');
    } else {
        $bid['picture'] = null;
    }

    // Normalise "a/b" style placements to "a / b".
    $segments = explode('/', trim($node->filter('.placement')->text()));
    $bid['placement'] = implode(' / ', array_map('trim', $segments));

    return $bid;
}
/**
 * Tell whether any existing sibling has exactly the same text as the new one.
 *
 * @param Crawler $new_sibling       Node whose text is looked up.
 * @param array   $existing_siblings Nodes to compare against; each must offer text().
 * @return bool True when at least one existing sibling has identical text.
 */
public function textExists(Crawler $new_sibling, array $existing_siblings)
{
    $new_sibling_text = $new_sibling->text();

    foreach ($existing_siblings as $existing_sibling) {
        // Strict comparison: with == two numeric-looking strings such as
        // "1e3" and "1000" compare equal although the texts differ.
        if ($new_sibling_text === $existing_sibling->text()) {
            return true;
        }
    }

    return false;
}
/**
 * Describe a category link node.
 *
 * @param Crawler $crawler Link node.
 *
 * @return array ['name' => node text, 'href' => $this->baseUrl followed by the href attribute].
 */
protected function getCategoryLink(Crawler $crawler)
{
    $name = $crawler->text();
    $href = $this->baseUrl . $crawler->attr('href');

    return array('name' => $name, 'href' => $href);
}
/**
 * Parses showtime infos from an individual div.
 *
 * The node text is split into chunks. When getTime() gives a null-like result
 * for the first chunk, that chunk is stored as the showtime's info. Every
 * chunk that getTime() turns into a non-empty value is collected as a time;
 * duplicates are removed before the times are stored.
 *
 * @param Crawler $timeSpan
 * @return ShowtimeInfo
 */
private function parseShowtime(Crawler $timeSpan)
{
    $showtime = new ShowtimeInfo();
    // NOTE(review): the first str_replace() argument shows up as a plain space
    // here. If it really is one, the explode(' ') below can only yield a single
    // chunk; it may originally have been a different whitespace character
    // (e.g. a non-breaking space). Confirm against the original file bytes.
    $texts = explode(' ', str_replace(' ', '', $timeSpan->text()));
    // Loose comparison: any value equal to null under == counts as "not a time".
    if ($this->getTime($texts[0]) == null) {
        $showtime->setInfo($texts[0]);
    }
    $times = [];
    foreach ($texts as $text) {
        // Decode HTML entities and trim before trying to read the chunk as a time.
        $time = trim(html_entity_decode($text));
        $time = $this->getTime($time);
        if (!empty($time)) {
            $times[] = $time;
        }
    }
    $showtime->setTimes(array_unique($times));
    return $showtime;
}
/**
 * Derive pagination figures from the pagination info node.
 *
 * With no node, every figure is 0 except results_per_page, which is 10.
 * With exactly one node, the last line of its text is split on spaces and the
 * words at indexes 0, 2 and 4 are read as first item, last item and total.
 * With any other node count the returned object has no properties.
 *
 * @param Crawler $info Node list holding the pagination summary.
 *
 * @return \stdClass Properties: total_results, results_per_page, num_pages,
 *                   first_on_page, last_on_page, current_page.
 */
private function getPaginationInfo(Crawler $info)
{
    $result = new \stdClass();
    $nodeCount = $info->count();

    if (0 === $nodeCount) {
        // No results.
        $result->total_results = 0;
        $result->results_per_page = 10;
        $result->num_pages = 0;
        $result->first_on_page = 0;
        $result->last_on_page = 0;
        $result->current_page = 0;
    } elseif (1 === $nodeCount) {
        $lines = explode(PHP_EOL, $info->text());
        $words = explode(' ', array_pop($lines));

        $firstItem = intval($words[0]);
        $lastItem = intval($words[2]);

        $result->total_results = intval($words[4]);
        // (last item - first) + 1 => Items 21 -> 40 => 40-21+1 = 20 items.
        $result->results_per_page = $lastItem - $firstItem + 1;
        $result->num_pages = intval(ceil($result->total_results / $result->results_per_page));
        $result->first_on_page = $firstItem;
        $result->last_on_page = $lastItem;
        $result->current_page = intval(ceil($result->first_on_page / $result->results_per_page));
    }

    return $result;
}
/**
 * Assert that the text of the node equals the empty string.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node Node expected to have no text.
 */
public function seeNodeEmpty($node)
{
    $actualText = $node->text();

    $this->assertEquals('', $actualText);
}
/**
 * Get the price value from a pricePerUnit node.
 *
 * The text of the node's child elements (the "/unit" part) is cut off the
 * end of the node text; what remains is stripped of whitespace and the pound
 * sign.
 *
 * @param Crawler $node
 * @return string The bare price, e.g. "1.50".
 */
protected function getPrice(Crawler $node)
{
    $children = '';
    foreach ($node->children() as $child) {
        $children .= $child->nodeValue;
    }

    $text = $node->text();
    $price = $text;

    // Only cut when there is child text and it is actually found. An empty
    // needle or a failed lookup used to produce a length of 0, which returned
    // an empty price for nodes without a unit element.
    if ($children !== '') {
        $unitPosition = mb_strpos($text, $children);
        if ($unitPosition !== false) {
            $price = mb_substr($text, 0, $unitPosition); // remove /unit
        }
    }

    // remove whitespace and pound @TODO handle different currencies
    return trim($price, "£ \t\n\r\v");
}
/**
 * Crawl the lelong.com.my game console listing and save one Products record
 * for every scraped entry.
 *
 * Scraped data is gathered in the global $products array, keyed by the index
 * each() passes to its callback, with the fields title, url, price, image and
 * shipping. Performs HTTP requests through file_get_contents() and database
 * writes through Products::save().
 */
public function indexGames()
{
    $crawler = new Crawler();
    // Extract all product details into the global $products array.
    global $products;
    $products = array();
    $html = file_get_contents("http://www.lelong.com.my/toys-and-games/game-console/");
    $crawler->addContent($html);
    // Retailer logo extraction is disabled:
    //$retailer_logo = $crawler->filter('div#top1Logo img')->attr('src');
    // Category name as shown by the link to the category page.
    $category = $crawler->filter('a[href="/toys-and-games/game-console/"]')->text();
    // NOTE(review): the page loop below runs once for every div.item4inline
    // node found on the first page and adds each downloaded page to that
    // node's crawler - confirm this is intended.
    $crawler->filter('div.item4inline')->each(function ($crawler) {
        // Pages 2 to 5; $i is advanced at the end of the body.
        for ($i = 2; $i <= 5;) {
            $url = 'http://www.lelong.com.my/toys-and-games/game-console/?D=' . $i;
            $html = file_get_contents($url);
            $crawler->addContent($html);
            global $products;
            global $rank;
            // Title and link; '//' is stripped from the href.
            $rank = $crawler->filter('span.catalogTitle')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['title'] = $crawler->text();
                $products[$i]['url'] = str_replace('//', '', $crawler->parents()->attr('href'));
            });
            // Price with the 'RM' prefix and thousands commas removed.
            $rank = $crawler->filter('div.catalogPrice b')->each(function ($crawler, $i) use(&$products) {
                $toReplace = array('RM', ',');
                $with = array('', '');
                $products[$i]['price'] = str_replace($toReplace, $with, $crawler->text());
            });
            // NOTE(review): the image field is filled from an `id` attribute, not a URL - confirm.
            $rank = $crawler->filter('div.catalog-wrap')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['image'] = $crawler->parents()->attr('id');
            });
            $rank = $crawler->filter('div.catalogIcon')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['shipping'] = $crawler->children()->text();
            });
            // NOTE(review): at this point $rank holds the value returned by
            // each(); incrementing it looks unintended - confirm.
            ++$rank;
            $i++;
            //print_r($products);
        }
    });
    // Insert the scraped data using the model.
    foreach ($products as $pro) {
        $product = new Products();
        if ($category == 'Game Console') {
            $product->category_id = 6;
            $product->condition_id = 3;
        }
        // Look up brands whose title equals any word of the product title.
        $arrProduct = explode(' ', $pro['title']);
        $brands = \DB::table('brand')->whereIn('brand_title', $arrProduct)->get();
        // NOTE(review): if get() returns a collection object instead of an
        // array, this check is always truthy and the fallback id is never used - confirm.
        if ($brands) {
            // The last matching brand wins.
            foreach ($brands as $brand) {
                $product->brand_id = $brand->id;
            }
        } else {
            $product->brand_id = 204;
        }
        $product->product_name = $pro['title'];
        $product->shopper_link = $pro['url'];
        $product->product_price = $pro['price'];
        $product->picture_link = $pro['image'];
        $product->product_shipping = $pro['shipping'];
        $product->save();
    }
}
/**
 * Return the text content of the node unchanged.
 *
 * @param Crawler $node Node to read.
 *
 * @return string
 */
protected function parse(Crawler $node)
{
    $text = $node->text();

    return $text;
}
/**
 * Crawl the ebay.com.my console listing and save one Products record for
 * every scraped entry.
 *
 * Scraped data is gathered in the global $products array, keyed by the index
 * each() passes to its callback, with the fields title, url, price, image and
 * shipping. Performs HTTP requests through file_get_contents() and database
 * writes through Products::save().
 */
public function indexNewGames()
{
    $crawler = new Crawler();
    // Extract all product details into the global $products array.
    global $products;
    $products = array();
    // The loop bounds cover page 1 only.
    for ($i = 1; $i < 2; $i++) {
        $url = 'http://www.ebay.com.my/sch/Consoles-/139971/i.html?LH_ItemCondition=1000|4000|5000|6000&_pgn=' . $i . '&_skc=200&rt=nc';
        $html = file_get_contents($url);
        $crawler->addContent($html);
        // Category label of the result page.
        $category = $crawler->filter('span.kwcat b')->text();
        //print_r($category);
        // Text of the first span.cbx node, used below as the condition.
        $condition = $crawler->filter('span.cbx')->text();
        //print_r($condition);
        //echo "<br><br><strong>Page</strong>" . $i . " > " . $url . "<br><br>";
        $crawler->filter('ul#ListViewInner')->each(function ($crawler) {
            global $products;
            global $rank;
            // Title and link of every listing.
            $rank = $crawler->filter('h3.lvtitle a')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['title'] = $crawler->text();
                $products[$i]['url'] = $crawler->attr('href');
            });
            // Last li.lvprice.prc entry of each price list, without 'RM' and commas.
            $rank = $crawler->filter('ul.lvprices.left.space-zero')->each(function ($crawler, $i) use(&$products) {
                $toReplace = array('RM', ',');
                $with = array('', '');
                $products[$i]['price'] = str_replace($toReplace, $with, $crawler->filter('li.lvprice.prc')->last()->text());
            });
            $rank = $crawler->filter('a.img.imgWr2 img')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['image'] = $crawler->attr('src');
            });
            $rank = $crawler->filter('span.ship')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['shipping'] = $crawler->text();
            });
            // NOTE(review): at this point $rank holds the value returned by
            // each(); incrementing it looks unintended - confirm.
            ++$rank;
        });
        //dd($products);
    }
    // Insert the scraped data using the model.
    foreach ($products as $pro) {
        $product = new Products();
        if ($category == 'Consoles') {
            $product->category_id = 6;
        }
        if ($condition == 'Brand New(selected)') {
            $product->condition_id = 1;
        }
        // Look up brands whose title equals any word of the product title.
        $arrProduct = explode(' ', $pro['title']);
        $brands = \DB::table('brand')->whereIn('brand_title', $arrProduct)->get();
        // NOTE(review): if get() returns a collection object instead of an
        // array, this check is always truthy and the fallback id is never used - confirm.
        if ($brands) {
            // The last matching brand wins.
            foreach ($brands as $brand) {
                $product->brand_id = $brand->id;
            }
        } else {
            $product->brand_id = 204;
        }
        $product->product_name = $pro['title'];
        $product->shopper_link = $pro['url'];
        $product->product_price = $pro['price'];
        $product->picture_link = $pro['image'];
        $product->product_shipping = $pro['shipping'];
        $product->save();
    }
}
/**
 * Return the text of the node; the field name does not influence the result.
 *
 * @param string  $field Name of the field being extracted (not used here).
 * @param Crawler $node  Node to read.
 *
 * @return string
 */
protected function extractText($field, Crawler $node)
{
    $text = $node->text();

    return $text;
}
/**
 * Process a general node into a plain object.
 *
 * @since 0.9.0
 *
 * @param \Symfony\Component\DomCrawler\Crawler $node
 *
 * @return \stdClass Object with the properties nodeName and text.
 *
 * @author nguyenvanduocit
 */
public function process_General(Crawler &$node)
{
    // Casting an array to object yields a stdClass with one property per key.
    return (object) array(
        'nodeName' => $node->nodeName(),
        'text' => $node->text(),
    );
}
/**
 * Print the text of every node matching the `hx` selector, one per line,
 * prefixed with "RANK:: ". For documentation, follow
 * http://symfony.com/doc/current/components/dom_crawler.html#accessing-node-values
 *
 * @param \Symfony\Component\DomCrawler\Crawler $crawler
 *
 * @return void
 */
protected function parse(Crawler $crawler)
{
    $printRank = function (Crawler $match) {
        $line = 'RANK:: ' . $match->text() . PHP_EOL;
        echo $line;
    };

    $crawler->filter('hx')->each($printRank);
}
/**
 * Fails when the text content of the selected email's body contains $text.
 *
 * @Then the email body should not contain :text
 */
public function theEmailBodyShouldNotContainText($text)
{
    $noEmailSelected = null === $this->message;
    if ($noEmailSelected) {
        throw new \RuntimeException(
            'Select an email which has to have been sent first. '
            . 'You can use the step: "an email with subject :subject should have been sent (to :email)"'
        );
    }

    $body = new Crawler($this->message->getBody());
    $bodyText = $body->text();

    Assert::assertNotContains($text, $bodyText);
}
/**
 * Crawl the mudah.my "Games & Consoles" listing and save one Products record
 * for every scraped entry.
 *
 * Scraped data is gathered in the global $products array, keyed by the index
 * each() passes to its callback, with the fields title, url, price, image,
 * location and condition. Performs HTTP requests through file_get_contents()
 * and database writes through Products::save().
 *
 * @return string HTML snippet with a success message.
 */
public function indexGames()
{
    $crawler = new Crawler();
    // Extract all product details into the global $products array.
    global $products;
    $products = array();
    $html = file_get_contents("http://www.mudah.my/Malaysia/Games-and-Consoles-for-sale-3120?lst=0&fs=1&cg=3120&w=3&so=1&st=s");
    $crawler->addContent($html);
    // Category label taken from the "see all ads" link.
    $category = $crawler->filter('a[title="See all ads in Games & Consoles category"] span')->text();
    // NOTE(review): the page loop below runs once for every div.listing_thumbs
    // node found on the first page and adds each downloaded page to that
    // node's crawler - confirm this is intended.
    $crawler->filter('div.listing_thumbs')->each(function ($crawler) {
        // Pages 2 to 5; $i is advanced at the end of the body.
        for ($i = 2; $i <= 5;) {
            $url = 'http://www.mudah.my/Malaysia/Games-and-Consoles-for-sale-3120?o=' . $i . '&q=&so=1&th=1';
            $html = file_get_contents($url);
            $crawler->addContent($html);
            global $rank;
            global $products;
            // Title and link of every listing.
            $rank = $crawler->filter('h2.list_title.truncate a')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['title'] = $crawler->text();
                $products[$i]['url'] = $crawler->attr('href');
            });
            // Price without whitespace and the 'RM' prefix.
            // NOTE(review): the whitespace literal is reproduced as it appears
            // here; it may be a non-breaking space in the original file - confirm.
            $rank = $crawler->filter('div.ads_price')->each(function ($crawler, $i) use(&$products) {
                $toReplace = array(' ', 'RM');
                $with = array('', '');
                $products[$i]['price'] = str_replace($toReplace, $with, $crawler->text());
            });
            $rank = $crawler->filter('li.listing_thumbs_image img')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['image'] = $crawler->attr('src');
            });
            // Location text with the whitespace literal removed (same caveat as above).
            $rank = $crawler->filter('div.location')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['location'] = str_replace(' ', '', $crawler->children()->siblings()->text());
            });
            $rank = $crawler->filter('div[title="Condition"]')->each(function ($crawler, $i) use(&$products) {
                $products[$i]['condition'] = $crawler->text();
            });
            // NOTE(review): at this point $rank holds the value returned by
            // each(); incrementing it looks unintended - confirm.
            ++$rank;
            $i++;
            //print_r($products);
        }
    });
    // Insert the scraped data using the model.
    foreach ($products as $pro) {
        $product = new Products();
        if ($category == 'Games & Consoles') {
            $product->category_id = 6;
        }
        // Look up brands whose title equals any word of the product title.
        $arrProduct = explode(' ', $pro['title']);
        $brands = \DB::table('brand')->whereIn('brand_title', $arrProduct)->get();
        // NOTE(review): if get() returns a collection object instead of an
        // array, this check is always truthy and the fallback id is never used - confirm.
        if ($brands) {
            // The last matching brand wins.
            foreach ($brands as $brand) {
                $product->brand_id = $brand->id;
            }
        } else {
            $product->brand_id = 1;
        }
        $product->product_name = $pro['title'];
        $product->shopper_link = $pro['url'];
        $product->product_price = $pro['price'];
        // The thumbnail URL is rewritten by replacing 'thumbs' with 'images'.
        $product->picture_link = str_replace('thumbs', 'images', $pro['image']);
        $product->product_location = $pro['location'];
        // 'New' maps to condition 1, everything else to condition 2.
        if ($pro['condition'] == 'New') {
            $product->condition_id = 1;
        } else {
            $product->condition_id = 2;
        }
        $product->save();
    }
    return "<div class='alert alert-success'>Successfully crawler site</div>";
}