$sub_category_string = $html_1->find("div[@class='product_category'] h2 a"); if (!empty($product_string)) { // Look for the product links foreach ($product_string as $product_link) { handle_products($product_link); } } elseif (!empty($sub_category_string)) { // Look for the subcategories foreach ($sub_category_string as $menu_link_2) { $link_2 = $base_url_scheme . SCHEME_TO_HOST . $base_url_host . $menu_link_2->href; // echo "Subcategory: " . $link_2 . "\n"; $html_content_2 = scraperwiki::scrape($link_2); $html_2 = str_get_html($html_content_2); // Look for the product links foreach ($html_2->find("div[@class='productLinkArea'] a") as $product_link) { handle_products($product_link); } } } } function handle_products($product_link) { global $base_url_host, $base_url_scheme, $total; if (!empty($product_link)) { // There are times when Thule will list discontinued products as part of their catalog - test for this... $found = stripos($product_link, "discontinued"); if ($found === false) { $link_3 = $base_url_scheme . SCHEME_TO_HOST . $base_url_host . $product_link->href; $cat_raw = str_replace("/en-US/US/Products/", "", $product_link->href); $cats = dirname($cat_raw); $cat_terms = array("Base-Racks/Feet", "Base-Racks/LoadAccessories", "Base-Racks/LoadBars", "Bike-Carriers/Accessories", "Bike-Carriers/Hitch", "Bike-Carriers/RearDoor", "Bike-Carriers/RoofCarriers", "Bike-Carriers/SpareTire", "Bike-Carriers/TruckBed", "Cargo-Carriers/Bags", "Cargo-Carriers/Baskets", "Cargo-Carriers/Boxes", "Cargo-Carriers/HitchCargo", "Luggage/DaypacksAndMessengers", "Luggage/LaptopAndTablet", "Luggage/LuggageAndDuffels", "Snow-Chains/SnowChains", "Snowsports/Accessories", "Snowsports/HitchSki", "Snowsports/SkiBoxes", "Snowsports/SkiCarriers", "Watersports/Accessories", "Watersports/WatersportCarriers");
$html = str_get_html($html_content);

$name_raw = trim($html->find("div[@class='column details_overview'] h2 span", 0));
$name = !empty($name_raw) ? strip_tags($name_raw) : "";

$desc_raw = trim($html->find("div[@class='column details_overview'] h3 span", 0));
$desc = !empty($desc_raw) ? strip_tags($desc_raw) : "";

$price_raw = trim($html->find("div[@class='pricing'] span[@id='phcontent_0_ctl00_lblPriceText']", 0));
$price = strip_tags($price_raw);
$price = str_replace("MSRP \$", "", $price);
$price = trim(str_replace(" (USD)", "", $price));

$image = $html->find("img[@id='imgProductBomImage_0']", 0)->src;

echo "{$name}: {$image}\n";

// Add it to an array.
$record = array('id' => $total, 'product_name' => trim($name), 'desciption' => trim($desc), 'price' => $price, 'img' => $image, 'category' => $cat);

// Add it to the table.
scraperwiki::save_sqlite(array('id'), array($record), "products_support", 2);

// Increment the 'id' counter.
$total++;
}
}

/**
 * Normalise a product name: hyphens become spaces, then every character
 * that is not an ASCII letter, digit or whitespace is removed.
 *
 * @param mixed $string Raw product name.
 * @return mixed The cleaned name, or NULL when no name was supplied.
 */
function cleanProductName($string) {
    // A bare truthiness test would also throw away the legitimate name "0",
    // so that one falsy string is let through explicitly.
    if (!$string && $string !== '0') {
        return null;
    }

    $spaced = str_replace("-", " ", $string);

    return preg_replace("/[^A-Za-z0-9\\s]/", "", $spaced);
}

foreach ($products as $product) {
    handle_products($product);
}

echo "Number of products: " . $total;
}

// Include the library.
require 'scraperwiki/simple_html_dom.php';

// We'll need this unchanging element throughout.
define("SCHEME_TO_HOST", "://");

// The base URLs.
$base_urls = array(
    "http://www.rockymounts.com/Ski_and_Snowboard_Racks_s/154.htm",
    "http://www.rockymounts.com/bike_racks_s/117.htm",
    "http://www.rockymounts.com/truck_bike_racks_s/118.htm",
    "http://www.rockymounts.com/hitch_bike_racks_s/163.htm",
    "http://www.rockymounts.com/category_s/204.htm",
    "http://www.rockymounts.com/category_s/202.htm",
    "http://www.rockymounts.com/racks_lock_cores_s/23.htm",
    "http://www.rockymounts.com/gear_and_clothing_s/34.htm",
    "http://www.rockymounts.com/racks_spareparts_s/35.htm"
);

// Counter used to count the parts and create a unique ID for insertion into the table.
$total = 1;

// Loop over the categories found on the page.
foreach ($base_urls as $base_url) {
    $cat_content = scraperwiki::scrape($base_url);
    $cat_html = str_get_html($cat_content);

    // str_get_html() hands back false for an empty or oversized page. Calling
    // ->find() on that is a fatal error that would abort every remaining
    // category, so report the failure and move on to the next URL instead.
    if (!$cat_html) {
        echo "Skipping " . $base_url . ": no parsable HTML returned\n";
        continue;
    }

    // Look for the product links
    foreach ($cat_html->find("a[@class='productnamecolor']") as $product_link) {
        handle_products($product_link->href);
    }
}

echo $total;

function handle_products($product_link) {
    global $total;
    if (!empty($product_link)) {
        $html_content = scraperwiki::scrape($product_link);
        $html = str_get_html($html_content);
        $name = utf8_encode(trim($html->find("span[@itemprop='name']", 0)->plaintext));
        $price = trim($html->find("span[@itemprop='price']", 0)->innertext);
        $code = trim($html->find("span[@class='product_code']", 0)->innertext);
        $desc = utf8_encode(trim($html->find("span[@itemprop='description']", 0)->plaintext));
        $features = trim($html->find("div[@id='ProductDetail_ProductDetails_div2'] ul", 0));
        $brand = "Rockymounts";