Example #1
0
 /**
  * Front controller for this bespoke web scraper.
  *
  * Fetches the product list page at $url, then fetches every listed product's
  * own page to record its size and description, and finally echoes a
  * pretty-printed JSON document holding the products under "results" and the
  * value of calculateTotal() under "total".
  *
  * If a page cannot be turned into an XPath object, or the result cannot be
  * JSON-encoded, the error message is printed and the script terminates with
  * a non-zero exit status so that callers can detect the failure.
  *
  * @param string $url The URL of the product list page to scrape products from
  * @return void
  */
 public function execute($url)
 {
     $retriever = new HtmlRetriever();
     $xpath = $this->createXPathOrExit($retriever->retrieveHtml($url));
     $listPageParser = new ProductListPageParser($xpath);
     $products = $listPageParser->getProducts();
     foreach ($products as $product) {
         // NB: This I/O can be made non-blocking in PHP, but no need to do so here
         $html = $retriever->retrieveHtml($product->link);
         // The "size" of a product page is taken to be the byte length of its
         // HTML divided by 1024. mb_strlen() with the '8bit' encoding returns
         // the byte length as an integer regardless of the document's charset.
         $product->size = sprintf("%0.1fKb", mb_strlen($html, '8bit') / 1024);
         $singlePageParser = new SingleProductPageParser($this->createXPathOrExit($html));
         $product->description = $singlePageParser->getDescription();
         // The link is only needed to reach the product page; drop it from the output
         unset($product->link);
     }
     $output = new stdClass();
     $output->results = $products;
     $output->total = $this->calculateTotal($output->results);
     $json = json_encode($output, JSON_PRETTY_PRINT);
     if ($json === false) {
         // json_encode() returns false on failure (for instance when scraped text
         // is not valid UTF-8); report it instead of silently echoing nothing.
         print 'Unable to encode results as JSON: ' . json_last_error_msg();
         exit(1);
     }
     echo $json;
 }

 /**
  * Build an XPath object from HTML, terminating the script on failure.
  *
  * Wraps createDOMXPathObjectFromHtml(): when it throws a RuntimeException
  * the exception message is printed and the script exits with status 1.
  *
  * @param mixed $html HTML as returned by HtmlRetriever::retrieveHtml()
  * @return mixed Whatever createDOMXPathObjectFromHtml() returns
  *               (presumably a DOMXPath instance - confirm against that method)
  */
 private function createXPathOrExit($html)
 {
     try {
         return $this->createDOMXPathObjectFromHtml($html);
     } catch (RuntimeException $e) {
         print $e->getMessage();
         exit(1);
     }
 }
 /**
  * A product list page containing a single product yields a product whose
  * link, title and unit price match the values in the fixture document.
  */
 public function testProductIsExtractedCorrectly()
 {
     $expectedLink = "http://www.sainsburys.co.uk/shop/gb/groceries/ripe---ready/sainsburys-avocado-xl-pinkerton-loose-300g";
     $expectedTitle = "Sainsbury's Avocado Ripe & Ready XL Loose 300g";
     $expectedUnitPrice = 1.5;

     $parser = new ProductListPageParser($this->single_product_document_xpath);
     $extractedProducts = $parser->getProducts();
     // Only the first extracted product is inspected
     $firstProduct = $extractedProducts[0];

     $this->assertEquals($expectedLink, $firstProduct->link);
     $this->assertEquals($expectedTitle, $firstProduct->title);
     $this->assertEquals($expectedUnitPrice, $firstProduct->unit_price);
 }