/**
 * Verifies that product data is extracted correctly from well-formed HTML.
 *
 * Only the HTML parsing is exercised here: every page is served by the
 * mocked HTTP client, so no real requests are made. Covering the real
 * scraper accurately would require a local web server and is
 * intentionally out of scope.
 */
public function testParseProducts()
{
    $gateway = new HtmlGateway($this->httpMock);

    // URLs the mocked HTTP client knows how to answer.
    $singleListUrl = "http://test.com/list";
    $doubleListUrl = "http://test.com/list2";
    $firstDescriptionUrl = "http://test.com/description";
    $secondDescriptionUrl = "http://test.com/description2";

    // Fixture pages. The markup is deliberately minimal so that a failing
    // assertion is easy to trace back to the HTML that caused it.
    $singleListHtml = "<div class='product'>\n <div class='productInfo'>\n <a href='http://test.com/description'>\n Title\n <img src='test.png' alt=''>\n </a>\n </div>\n <p class='pricePerUnit'>£5.00</p>\n </div>";
    $doubleListHtml = "<div class='product'>\n <div class='productInfo'>\n <a href='http://test.com/description'>\n Title\n <img src='test.png' alt=''>\n </a>\n </div>\n <p class='pricePerUnit'>£5.00</p>\n </div>\n <div class='product'>\n <div class='productInfo'>\n <a href='http://test.com/description2'>\n Another Title\n <img src='test.png' alt=''>\n </a>\n </div>\n <p class='pricePerUnit'>£10.00</p>\n </div>";
    $firstDescriptionHtml = "<div id='information'>\n <h3>Description</h3>\n <div class='productText'>Description</div>\n <h3>Stuff we don't want</h3>\n <div class='productText'>Other stuff we don't want.</div>\n </div>";
    $secondDescriptionHtml = "<div id='information'>\n <h3>Description</h3>\n <div class='productText'>Another Description</div>\n <h3>Stuff we don't want</h3>\n <div class='productText'>Other stuff we don't want.</div>\n </div>";

    // Each row pairs a getHtml() argument with the HTML to return for it.
    $responsesByUrl = array(
        array($singleListUrl, $singleListHtml),
        array($doubleListUrl, $doubleListHtml),
        array($firstDescriptionUrl, $firstDescriptionHtml),
        array($secondDescriptionUrl, $secondDescriptionHtml),
    );

    // getHtml() may be called any number of times; every call is answered
    // from the map above according to the URL it receives.
    $this->httpMock
        ->expects($this->any())
        ->method('getHtml')
        ->will($this->returnValueMap($responsesByUrl));

    // Raw product rows (plain arrays, ready to build Product objects from).
    $firstProduct = array(
        "title" => "Title",
        "size" => 281,
        "unit_price" => 5.0,
        "description" => "Description",
    );
    $secondProduct = array(
        "title" => "Another Title",
        "size" => 289,
        "unit_price" => 10.0,
        "description" => "Another Description",
    );

    // The first listing holds one product, the second listing holds two.
    $expectedFromSingleList = array($firstProduct);
    $expectedFromDoubleList = array($firstProduct, $secondProduct);

    // Fetch both listings before asserting anything.
    $actualFromSingleList = $gateway->getProductData($singleListUrl);
    $actualFromDoubleList = $gateway->getProductData($doubleListUrl);

    $this->assertEquals($expectedFromSingleList, $actualFromSingleList);
    $this->assertEquals($expectedFromDoubleList, $actualFromDoubleList);
}
/**
 * Downloads a page through HttpScraper and rejects unsuccessful responses.
 *
 * The download is logged using the URL reported in the response status.
 * A response whose HTTP code is below 200, or 400 and above, is treated
 * as a failure and reported via an exception.
 *
 * @param string $url     Address of the page to fetch; forwarded unchanged
 *                        to HttpScraper::getHttp (presumably a string).
 * @param string $referer Referer value forwarded unchanged to
 *                        HttpScraper::getHttp (presumably a string).
 *
 * @return array The value returned by HttpScraper::getHttp; it is read
 *               here through its 'STATUS' ('url', 'http_code') and
 *               optional 'ERROR' entries.
 *
 * @throws PageDownloadWebbotExecutionException With code 112 when the HTTP
 *         status code is below 200 or is 400 or greater.
 */
protected function downloadPage($url, $referer)
{
    $page = HttpScraper::getHttp($url, $referer);

    // Record which URL was actually fetched, as reported by the response.
    $this->logActivity("Downloaded: " . $page['STATUS']['url']);

    $httpCode = $page['STATUS']['http_code'];
    $isFailure = $httpCode < 200 || $httpCode >= 400;

    if (!$isFailure) {
        return $page;
    }

    // Attach the cURL error text only when the response carries one.
    $curlErrorInfo = (isset($page['ERROR']) && $page['ERROR'] != "")
        ? 'CURL Error Information: ' . $page['ERROR'] . '.'
        : '';

    throw new PageDownloadWebbotExecutionException(
        112,
        array(
            'url' => $url,
            'http_code' => $httpCode,
            'curl_error_info' => $curlErrorInfo,
        )
    );
}