예제 #1
0
 /**
  * Checks that HtmlGateway::getProductData() converts listing HTML into
  * plain product data arrays.
  *
  * Only the HTML parsing is exercised: the HTTP layer is replaced by
  * $this->httpMock, which hands back canned markup per URL, so no web
  * server is needed to run this test.
  */
 public function testParseProducts()
 {
     $gateway = new HtmlGateway($this->httpMock);

     // Canned pages. The markup is deliberately minimal so that a failing
     // assertion is easy to trace back to the fixture.
     $singleProductListHtml = "<div class='product'>\n                     <div class='productInfo'>\n                         <a href='http://test.com/description'>\n                             Title\n                             <img src='test.png' alt=''>\n                         </a>\n                     </div>\n                     <p class='pricePerUnit'>&pound;5.00</p>\n                 </div>";
     $twoProductListHtml = "<div class='product'>\n                     <div class='productInfo'>\n                         <a href='http://test.com/description'>\n                             Title\n                             <img src='test.png' alt=''>\n                         </a>\n                     </div>\n                     <p class='pricePerUnit'>&pound;5.00</p>\n                 </div>\n                 <div class='product'>\n                     <div class='productInfo'>\n                         <a href='http://test.com/description2'>\n                             Another Title\n                             <img src='test.png' alt=''>\n                         </a>\n                     </div>\n                     <p class='pricePerUnit'>&pound;10.00</p>\n                 </div>";
     $firstDescriptionHtml = "<div id='information'>\n                     <h3>Description</h3>\n                     <div class='productText'>Description</div>\n                     <h3>Stuff we don't want</h3>\n                     <div class='productText'>Other stuff we don't want.</div>\n                 </div>";
     $secondDescriptionHtml = "<div id='information'>\n                     <h3>Description</h3>\n                     <div class='productText'>Another Description</div>\n                     <h3>Stuff we don't want</h3>\n                     <div class='productText'>Other stuff we don't want.</div>\n                 </div>";

     // URL => HTML pairs in the (argument, return value) row format that
     // returnValueMap() expects.
     $urlToHtml = array(
         array("http://test.com/list", $singleProductListHtml),
         array("http://test.com/list2", $twoProductListHtml),
         array("http://test.com/description", $firstDescriptionHtml),
         array("http://test.com/description2", $secondDescriptionHtml),
     );

     // No call-count constraint: getHtml() is stubbed for every URL above,
     // covering both the listing pages and the description pages they link to.
     $this->httpMock
         ->expects($this->any())
         ->method('getHtml')
         ->will($this->returnValueMap($urlToHtml));

     // Product rows the gateway is expected to produce.
     $firstProduct = array(
         "title" => "Title",
         "size" => 281,
         "unit_price" => 5.0,
         "description" => "Description",
     );
     $secondProduct = array(
         "title" => "Another Title",
         "size" => 289,
         "unit_price" => 10.0,
         "description" => "Another Description",
     );

     // The first listing holds one product, the second holds two; in both
     // cases the result is a list of product rows.
     $expectedFromList = array($firstProduct);
     $expectedFromList2 = array($firstProduct, $secondProduct);

     // Fetch both result sets before asserting on either of them.
     $actualFromList = $gateway->getProductData("http://test.com/list");
     $actualFromList2 = $gateway->getProductData("http://test.com/list2");

     $this->assertEquals($expectedFromList, $actualFromList);
     $this->assertEquals($expectedFromList2, $actualFromList2);
 }
예제 #2
0
 /**
  * Fetches a page via HttpScraper::getHttp() and rejects unsuccessful responses.
  *
  * The URL reported in the response's STATUS entry is logged for every
  * download, including those that are subsequently rejected.
  *
  * @param mixed $url     Address to download; forwarded unchanged to HttpScraper::getHttp()
  *                       (presumably a URL string - confirm against callers).
  * @param mixed $referer Referer value; forwarded unchanged to HttpScraper::getHttp().
  * @return mixed The value returned by HttpScraper::getHttp(); this method reads its
  *               'STATUS' entry ('url', 'http_code') and its optional 'ERROR' entry.
  * @throws PageDownloadWebbotExecutionException When the HTTP status code is below 200
  *                                              or is 400 or above.
  */
 protected function downloadPage($url, $referer)
 {
     $page = HttpScraper::getHttp($url, $referer);

     // Log before validating, so failed downloads leave a trace as well.
     $this->logActivity("Downloaded: " . $page['STATUS']['url']);

     $httpCode = $page['STATUS']['http_code'];
     $isFailure = $httpCode < 200 || $httpCode >= 400;
     if (!$isFailure) {
         return $page;
     }

     // Attach the transport-level error text only when one is present and non-empty.
     $hasCurlError = isset($page['ERROR']) && $page['ERROR'] != "";
     $curlErrorInfo = $hasCurlError
         ? 'CURL Error Information: ' . $page['ERROR'] . '.'
         : '';

     throw new PageDownloadWebbotExecutionException(112, array(
         'url' => $url,
         'http_code' => $httpCode,
         'curl_error_info' => $curlErrorInfo,
     ));
 }