/**
 * Given a source (URL), retrieve raw data
 * and turn it into an array of arrays of product data.
 *
 * The listing page at $source is fetched through the scraper; every
 * "div.product" element found on it is then followed (via the href of its
 * title link) to the product's own page, from which the description is read.
 *
 * Each sub array contains the following keys:
 * - title (string) - the title of the product
 * - description (string) - description of the product
 * - size (int) - size in bytes of the product page's raw HTML
 * - unit_price (float) - the price of the product
 *
 * Side effects: libxml internal error handling is switched on (and left on),
 * and the libxml errors gathered during the call are stored in $this->errors,
 * after which the libxml error buffer is cleared. This also happens when
 * parsing fails part-way, so errors never leak into a later call.
 *
 * @param string $source Information about where to load data from.
 *
 * @throws InvalidDataSourceException If the source is invalid.
 * @throws MalformedDataException If the data is malformed, i.e. an expected
 *                                element or attribute is missing.
 *
 * @return array
 */
public function getProductData($source)
{
    // Collect libxml errors internally instead of letting them surface as warnings.
    libxml_use_internal_errors(true);

    if (false === $this->looksLikeUrl($source)) {
        throw new InvalidDataSourceException("This isn't a URL.");
    }

    $return = array();

    try {
        // Create a DOM parser & load the listing HTML.
        $dom = new DOMParser();
        $listHtml = $this->scraper->getHtml($source);
        $dom->loadHTML($listHtml);

        foreach ($dom->querySelectorAll("div.product") as $product) {
            // Get the title element & price element. Each lookup is checked
            // before use so missing markup is reported as malformed data
            // rather than crashing on a non-object.
            $infoElement = $product->querySelector("div.productInfo");
            $titleElement = is_object($infoElement) ? $infoElement->querySelector("a") : null;
            $priceElement = $product->querySelector("p.pricePerUnit");

            if (!is_object($titleElement) || !is_object($priceElement)) {
                throw new MalformedDataException("Product entry is missing its title link or unit price.");
            }

            $hrefNode = $titleElement->getAttributeNode("href");

            if (!is_object($hrefNode)) {
                throw new MalformedDataException("Product title link has no href attribute.");
            }

            // Load the product-specific page.
            $productUrl = $hrefNode->textContent;
            $productHtml = $this->scraper->getHtml($productUrl);
            $productDom = new DOMParser();
            $productDom->loadHTML($productHtml);

            // Get the product description.
            // Note: this does not have a unique ID, this is very volatile.
            $informationElement = $productDom->querySelector("div#information");
            $descriptionElement = is_object($informationElement)
                ? $informationElement->querySelector("div.productText")
                : null;

            if (!is_object($descriptionElement)) {
                throw new MalformedDataException("Product page is missing its description.");
            }

            // Set up the product data.
            $productData = array();
            $productData['title'] = trim($titleElement->textContent);
            $productData['description'] = trim($descriptionElement->textContent);
            // strlen() counts bytes, which is what "size" is documented to be.
            $productData['size'] = strlen($productHtml);
            // Strip everything except digits and dots (currency symbols,
            // unit suffixes) before converting the price to a float.
            $productData['unit_price'] = (float) preg_replace('/[^0-9.]/', '', $priceElement->textContent);

            $return[] = $productData;
        }
    } catch (\Exception $e) {
        // Still record and flush the libxml errors on failure so they are
        // not attributed to a later call, then let the exception propagate.
        $this->errors = libxml_get_errors();
        libxml_clear_errors();

        throw $e;
    }

    // Put errors in $this->errors.
    $this->errors = libxml_get_errors();
    libxml_clear_errors();

    return $return;
}