예제 #1
0
 /**
  * A 404 response must still produce a well-formed, empty result object.
  *
  * The scraper is handed the HTTP client returned by getHttpClient() for a single
  * empty-bodied 404 response, and is then asked to scrape a URL.
  */
 public function testCanHandle404()
 {
     $notFound = new GuzzleResponse(404, ['Content-Type' => 'text/html; charset=UTF-8'], '');
     $this->scraper->setHttpClient($this->getHttpClient([$notFound]));

     $output = $this->scraper->scrape('http://www.sainsburys.mock/shop/gb/not-found');

     $this->assertInternalType('object', $output, "Scrape must return an object");
     $this->assertInternalType('array', $output->results, "Scrape must return an array of results");
     // assertCount reports the actual element count on failure instead of a bare "0 != n".
     $this->assertCount(0, $output->results, "The results contains the correct number of products");
     // Loose comparison on purpose: the total may be a numeric string or a number.
     $this->assertEquals("0.0", $output->total, "The total unit price is correct");
 }
예제 #2
0
 /**
  * Scraper::getItemData() must return the expected product fields for the HTML
  * held in $this->html.
  *
  * Checks that the four keys are present and that title, unit price and
  * description have the expected values. The value of 'size' is not checked.
  */
 public function testGetItemData()
 {
     $dom = Scraper::getDom($this->html);
     $itemData = Scraper::getItemData($dom);

     $this->assertTrue(is_array($itemData));

     $this->assertArrayHasKey('title', $itemData);
     $this->assertArrayHasKey('size', $itemData);
     $this->assertArrayHasKey('unit_price', $itemData);
     $this->assertArrayHasKey('description', $itemData);

     // PHPUnit's signature is assertEquals($expected, $actual); keeping that order
     // makes failure output label the expected and actual values correctly.
     $this->assertEquals('Sainsbury\'s Apricot Ripe & Ready x5', $itemData['title']);
     $this->assertEquals('3.00', $itemData['unit_price']);
     $this->assertEquals('Apricots', $itemData['description']);
 }
예제 #3
0
파일: run.php 프로젝트: kimon89/scraper
use Sainsburys\Scraper;
// Scrape a product listing page: follow every product link found on it, collect
// each product's data, sum the unit prices, and print the result as JSON.
// Failures on individual product pages are logged and skipped so that stdout
// only ever carries the JSON document.
$url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&parent_category_rn=12518&top_category=12518&langId=44&beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&categoryId=185749&listId=&storeId=10151&promotionId=#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";

// Fetch the listing HTML and build a DOM object from it.
$curlData = Scraper::curlPage($url);
$dom = Scraper::getDom($curlData);
if (!$dom) {
    throw new Exception('Dom failed to load');
}

$data = ['results' => [], 'total' => 0];

// Collect all the product links.
$elems = $dom->find('.productInfo h3 a');
// The listing DOM is not needed once the links are collected, so free its memory.
$dom->clear();

// Visit each product link in turn to gather its data.
foreach ($elems as $el) {
    try {
        $itemDom = Scraper::getDom(Scraper::curlPage($el->href));
        if (!$itemDom) {
            // Apply the same check the listing page gets; otherwise a falsy DOM would
            // cause a fatal error below that catch (Exception) cannot intercept.
            throw new Exception('Dom failed to load');
        }

        try {
            $itemData = Scraper::getItemData($itemDom);
        } finally {
            // Release the product DOM even when extraction throws.
            $itemDom->clear();
        }

        $data['results'][] = $itemData;
        // Add this product's price to the running total.
        $data['total'] += $itemData['unit_price'];
    } catch (Exception $e) {
        // Report through the error log rather than stdout so the JSON printed
        // below stays parseable.
        error_log('Url: ' . $el->href . ' - ' . $e->getMessage());
    }
}

print json_encode($data);