/**
 * getProducts()
 *
 * Scrapes the products and returns them in a unified JSON format.
 *
 * Every node matching "#productLister .product" in $this->source is read for
 * its title, link and unit price. The product link is then followed with
 * $this->client to obtain the size of the linked page (in kb) and the
 * product description. A product whose details cannot be fetched or parsed
 * is logged through Console::log() and skipped; the remaining products are
 * still returned.
 *
 * @return mixed Whatever $this->toJson() returns for the
 *               ['results' => ..., 'totals' => ...] structure (presumably a
 *               JSON string), or false when a JsonException is caught while
 *               transforming/encoding the collection.
 */
public function getProducts()
{
    // Load HTML into scraper
    $scraper = new Crawler($this->source);

    // Get products
    $results = $scraper->filter('#productLister .product');

    $products = [];

    foreach ($results as $product) {
        // Create new crawler for each product iteration
        $product = new Crawler($product);

        try {
            // Get title
            $title = trim($product->filter('h3')->first()->text());

            // Get link
            $link = trim($product->filter('h3 a')->first()->attr('href'));

            // Get unit_price: keep digits and the decimal point only.
            // The previous pattern also let a literal "+" through, because
            // "+" inside a character class is not a quantifier.
            $unit_price = preg_replace('/[^\\d.]/', '', $product->filter('.pricePerUnit')->first()->text());

            // Get secondary resource
            Console::log('Following product link: ' . $link);
            $resource = $this->client->request('GET', $link);

            // Read the body once: it feeds both the size fallback and the description crawler
            $body = $resource->getBody()->getContents();

            // Get size / Content-Length. The header is optional, so fall back
            // to the byte length of the body that was actually received.
            $lengthHeader = $resource->getHeader('Content-Length');
            $bytes = (isset($lengthHeader[0]) && is_numeric($lengthHeader[0]))
                ? $lengthHeader[0]
                : strlen($body);
            $size = number_format($bytes / 1024, 2);

            // Create tertiary crawler to get description
            $productDetails = new Crawler($body);

            // Get description
            $description = trim($productDetails->filter('#information .productText')->first()->text());
        } catch (RequestException $e) {
            Console::log('An error occurred while trying to fetch details for a product. I skipped this one for now. Error: ' . $e->getMessage());
            continue;
        } catch (\InvalidArgumentException $e) {
            // NOTE(review): assumes Crawler is Symfony's DomCrawler, whose
            // text()/attr() throw \InvalidArgumentException on an empty node
            // list - confirm against this file's imports.
            Console::log('An error occurred while trying to parse a product because an expected element is missing. I skipped this one for now. Error: ' . $e->getMessage());
            continue;
        }

        // Collect item
        $products[] = [
            'title' => $title,
            'unit_price' => $unit_price,
            'size' => $size . 'kb',
            'description' => $description,
        ];
    }

    // Get total value; computed before the collection is transformed so it
    // is based on the raw scraped unit prices
    $totalSum = array_sum(array_column($products, 'unit_price'));

    try {
        // Transform collection
        $products = $this->transformCollection($products);

        // Namespace collection and encode to JSON
        $json = $this->toJson(['results' => $products, 'totals' => $totalSum]);
    } catch (JsonException $e) {
        // JSON transformation failed, I'd usually also log this incident here with Monolog or similar
        Console::log('An error occurred while transforming the products into the appropriate JSON format: ' . $e->getMessage());
        return false;
    }

    return $json;
}
<?php

// Pull in the autoloader from vendor/ (mainly for PSR-4 auto-loading)
require_once __DIR__ . '/vendor/autoload.php';

// Set up the scraper and the HTTP client used for the first request
$scraper = new \Sainsbury\HtmlScraper();
$client = new \GuzzleHttp\Client();

// Announce, then download the product listing page
\Sainsbury\Console::log('Getting initial source...');
$listingUrl = 'http://hiring-tests.s3-website-eu-west-1.amazonaws.com/2015_Developer_Scrape/5_products.html';
$response = $client->request('GET', $listingUrl);

// Feed the downloaded markup to the scraper and collect its output
$markup = $response->getBody()->getContents();
$output = $scraper->setSource($markup)->getProducts();

// Print whatever the scraper produced
print_r($output);