Exemplo n.º 1
0
 /**
  * Given a source (URL), retrieve raw data
  * and turn it into an array of arrays of product data.
  *
  * The listing page at $source is fetched through $this->scraper; every
  * "div.product" entry found there is then followed to its own page in
  * order to read the product description.
  *
  * Each sub array contains the following keys:
  *  - title (string)       - the title of the product
  *  - size (int)           - size in bytes of the product page HTML
  *  - unit_price (float)   - the price of the product
  *  - description (string) - description of the product
  *
  * libxml errors gathered while parsing are stored in $this->errors, also
  * when an exception interrupts the scrape, and the libxml error-handling
  * mode that was active before the call is restored afterwards.
  *
  * @param string $source Information about where to load data from.
  *
  * @throws InvalidDataSourceException If the source is invalid.
  * @throws MalformedDataException     If an expected element or attribute is missing.
  *
  * @return array
  */
 public function getProductData($source)
 {
     // Validate first so that a bad source never touches global libxml state.
     if (false === $this->looksLikeUrl($source)) {
         throw new InvalidDataSourceException("This isn't a URL.");
     }

     // Looks up a selector and fails with a domain exception instead of
     // letting a missing node turn into a fatal "call on null" error.
     // NOTE(review): assumes querySelector() yields a non-object (null/false)
     // when nothing matches - confirm against the DOMParser implementation.
     $requireElement = function ($context, $selector) {
         $element = $context->querySelector($selector);
         if (!is_object($element)) {
             throw new MalformedDataException("Expected element not found: " . $selector);
         }
         return $element;
     };

     // Buffer libxml errors instead of emitting warnings; remember the
     // previous mode so it can be put back when we are done.
     $previousErrorMode = libxml_use_internal_errors(true);

     try {
         // Create a DOM parser & load the listing HTML.
         $listHtml = $this->scraper->getHtml($source);
         $dom = new DOMParser();
         $dom->loadHTML($listHtml);

         $return = array();
         foreach ($dom->querySelectorAll("div.product") as $product) {
             // Get the title element & price element.
             $titleElement = $requireElement($requireElement($product, "div.productInfo"), "a");
             $priceElement = $requireElement($product, "p.pricePerUnit");

             // The title link points at the product-specific page.
             $hrefNode = $titleElement->getAttributeNode("href");
             if (!is_object($hrefNode) || '' === trim($hrefNode->textContent)) {
                 throw new MalformedDataException("Product link has no href attribute.");
             }
             $productUrl = trim($hrefNode->textContent);

             // Load the product-specific page.
             $productHtml = $this->scraper->getHtml($productUrl);
             $productDom = new DOMParser();
             $productDom->loadHTML($productHtml);

             // Get the product description.
             // Note: this does not have a unique ID, this is very volatile.
             $descriptionElement = $requireElement(
                 $requireElement($productDom, "div#information"),
                 "div.productText"
             );

             // Set up the product data.
             $productData = array();
             $productData['title'] = trim($titleElement->textContent);
             $productData['description'] = trim($descriptionElement->textContent);
             // strlen() counts bytes, which is what "size" is meant to report.
             $productData['size'] = strlen($productHtml);
             // Drop everything but digits and dots (currency symbols, spaces, ...).
             $productData['unit_price'] = (float) preg_replace('/[^0-9.]/', '', $priceElement->textContent);
             $return[] = $productData;
         }

         return $return;
     } finally {
         // Put errors in $this->errors, then leave libxml as we found it.
         $this->errors = libxml_get_errors();
         libxml_clear_errors();
         libxml_use_internal_errors($previousErrorMode);
     }
 }