/**
 * Builds the product record for a product detail page.
 *
 * Runs the field sub-queries below the "BlockContent" element(s) inside
 * #ProductDetails, keeps the first resulting record and then adds the
 * description queried from #ProductDescription.
 *
 * NOTE(review): assumes xpSubQueries() returns a list of associative
 * records (one per matched root node) - confirm against XPathHelper.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return array field name => value; 'product.description' is always set
 */
public function parseProductDetails(XPathHelper $xph)
{
    $records = $xph->xpSubQueries(
        "//div[@id='ProductDetails']//div[@class='BlockContent']",
        array(
            'product.name' => 'h2',
            'product.price_retail' => "div[@class='ProductMain']/div[@class='ProductDetailsGrid']/div[@class='DetailRow RetailPrice']/div[@class='Value']",
            'product.price_listed' => "div[@class='ProductMain']/div[@class='ProductDetailsGrid']/div[@class='DetailRow PriceRow']/div[@class='Value']/em[@class='ProductPrice VariationProductPrice']",
            'product.sku' => "div[@class='ProductMain']/div[@class='ProductDetailsGrid']/div[@class='DetailRow ProductSKU']/div[@class='Value']",
        )
    );

    // reset() takes its argument by reference, so it must be given a real
    // variable rather than a call result. When nothing matched, start from
    // an empty record instead of relying on `false` being turned into an
    // array by the assignment below.
    if (is_array($records) && count($records) > 0) {
        $record = reset($records);
    } else {
        $record = array();
    }

    $record['product.description'] = $xph->queryValue(
        "//div[@id='ProductDescription']/div[@class='ProductDescriptionContainer']"
    );

    /*
     * TODO: parse additional data fields for weight, quantity, brand, rating..
     * //div[@id='ProductDetails']//div[@class='BlockContent']/div[@class='ProductMain']/div[@class='ProductDetailsGrid']/div[@class='DetailRow']
     * will return a list of div[@class='Label'] and div[@class='Value'] children
     */
    return $record;
}
/**
 * Classifies the loaded page.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return string "searchResult" when the 'srListing' query does not yield
 *                null, "product" in every other case
 */
public function parsePageType(XPathHelper $xph)
{
    $listing = $xph->xpQuery("//div[@class='srListing']");
    if ($listing !== null) {
        return "searchResult";
    }

    // FIXME: detect product pages explicitly; for now everything that is
    //        not a search result is reported as a product.
    // FIXME: detect "no result" pages.
    return "product";
}
/**
 * Opens $href through the parent implementation while a desktop Chrome
 * user agent is set in the shared cURL options.
 *
 * XPathHelper::$_curlopts is static, so the override is temporary: the
 * previous options are put back afterwards, including when the parent
 * call throws.
 *
 * @param mixed $href     passed unchanged to parent::openHref()
 * @param mixed $pageUrl  accepted but not forwarded to the parent.
 *                        NOTE(review): confirm whether the parent expects
 *                        it as a second argument.
 * @return mixed whatever parent::openHref() returns
 * @throws Exception rethrows any exception raised by the parent call
 */
public function openHref($href, $pageUrl = NULL)
{
    $savedOpts = XPathHelper::$_curlopts;
    XPathHelper::$_curlopts[CURLOPT_USERAGENT] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";

    try {
        $res = parent::openHref($href);
    } catch (Exception $e) {
        // Do not leak the overridden user agent into later requests.
        XPathHelper::$_curlopts = $savedOpts;
        throw $e;
    }

    XPathHelper::$_curlopts = $savedOpts;
    return $res;
}
/**
 * Collects the product fields of a product detail page.
 *
 * Three outcomes:
 *  - the page shows an error text starting with "Exception": empty array;
 *  - the page shows the "no longer carry" apology: only the combined
 *    'product.sku_and_upc' lookup is merged in;
 *  - otherwise: SKU/UPC, the label/value info table, the prices and the
 *    page URL are merged into one record.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return array merged record (see outcomes above)
 */
public function parseProductDetails(XPathHelper $xph)
{
    // Broken page with an error message.
    $errorText = $xph->queryValue("//div[@class='itemPageWrapper']/text()[5]");
    if (substr($errorText, 0, 9) === 'Exception') {
        return array();
    }

    $details = array();

    // Discontinued items have no price.
    $apologyHeading = $xph->queryValue("//div[@id='unluckyInfo']/div[@id='apology']/h2");
    if ($apologyHeading === "We're Sorry, We No Longer Carry This Item") {
        $codeRecords = $xph->xpSubQueries(
            "//div[@id='unluckyInfo']/div[@id='apology']",
            array('product.sku_and_upc' => "//div[@class='itemDataCode']")
        );
        return $this->array_merge_first_record($details, $codeRecords);
    }

    // Regular items: SKU and UPC.
    $codeRecords = $xph->xpSubQueries(
        "//div[@class='imgBorderBgNew']",
        array(
            'product.sku' => "//div[@class='itemDataCode'][span]",
            'product.upc' => "//div[@class='itemDataCode'][not(span)]",
        )
    );
    $details = $this->array_merge_first_record($details, $codeRecords);

    // Info tab: label/value pairs collapsed into a single record.
    $infoPairs = $xph->xpSubQueries(
        "//div[@id='tabInfoContent']/table/tbody/tr/td/table/tbody/tr/td/div[@class='itemData']",
        array('label' => 'b', 'value' => 'text()')
    );
    // TRICK: the collapsed record is wrapped in an array so that it has the
    // list-of-records shape array_merge_first_record() is given elsewhere.
    $infoRecords = array($this->collapseLabelValuePairs($infoPairs));
    $details = $this->array_merge_first_record($details, $infoRecords);

    // Pricing tab.
    $priceRecords = $xph->xpSubQueries(
        "//div[@id='tabPricingContent']/table/tbody/tr/td/div[@class='pricingDisplay']",
        array(
            'product.price_retail' => "table/tbody/tr/td/div[@class='retailPrice']",
            'product.price_listed' => "table/tbody/tr/td/div[@class='salePrice']|table/tbody/tr/td/div[@class='specialPrice']",
        )
    );
    $details = $this->array_merge_first_record($details, $priceRecords);

    $details['product.url'] = $xph->getUrl();

    return $details;
}
/**
 * Queries name, brand and both prices below the #item-spc element.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::xpSubQueries()
 */
public function parseProductDetails(XPathHelper $xph)
{
    $rootQuery = "//div[@id='item-spc']";

    // Field name => XPath handed to xpSubQueries() together with the root.
    $fieldQueries = array(
        'product.name' => "div[@id='item-summary']/h1[@class='item-name']",
        'product.brand' => "div[@id='item-summary']/h2[@class='item-brand']",
        'product.price_listed' => "div[@id='item-attributes']/div[@id='item-price']/div[@id='swanson-price']/b[@class='price']",
        'product.price_retail' => "div[@id='item-attributes']/div[@id='item-price']/div[@id='retail-price']/b[@class='price']",
    );

    return $xph->xpSubQueries($rootQuery, $fieldQueries);
}
/**
 * Looks up the pagination link of an offer/seller list.
 *
 * The XPath selects the href attribute of the anchor containing "Next"
 * inside span#online-pagination.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::queryValue()
 */
public function parseOfferSellerListPagination(XPathHelper $xph)
{
    $nextLinkQuery = "//span[@id='online-pagination']/a[contains(.,'Next')]/@href";
    $nextHref = $xph->queryValue($nextLinkQuery);

    return $nextHref;
}
/**
 * Classifies the loaded page as "searchResult" or "product".
 *
 * When neither kind is recognised, the page dump is appended to
 * "output.amazon.failed.parse.txt" below the configured 'file_root' and an
 * exception is thrown.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return string "searchResult" or "product"
 * @throws Exception when the page type cannot be determined
 */
public function parsePageType(XPathHelper $xph)
{
    // Any one of these elements marks a search result page; they are
    // probed in this order and the first hit wins.
    $searchMarkers = array(
        "//*[@id='noResultsTitle']",
        "//*[@id='s-result-count']",
        "//*[@id='atfResults']",
    );
    foreach ($searchMarkers as $markerQuery) {
        if ($xph->xpQuery($markerQuery) !== null) {
            return "searchResult";
        }
    }

    $buyingBoxQuery = "//div[@id='product-title_feature_div']/div[@class='buying'] | //form[@id='handleBuy']/div[@class='buying']";
    if ($xph->xpQuery($buyingBoxQuery) !== null) {
        return "product";
    }

    // TODO: fix this
    //if ('Robot Check' == $xph->queryValue("//html/body/title/text()", 1)) return "captcha";

    // Default: keep the page for later analysis, then fail.
    $ci =& get_instance();
    $timestamp = date('Y-m-d H:i:s');
    $logEntry = "\n\n----------------------------------------------------------\n" . $timestamp . "\n" . $xph->dump();
    $logFile = $ci->config->item('file_root') . "output.amazon.failed.parse.txt";
    file_put_contents($logFile, $logEntry, FILE_APPEND);

    throw new Exception(__FUNCTION__ . ": unexpected page type at " . $xph->__toString());
}
/**
 * Queries name, URL and both prices for each list item of the
 * 'category-products' listing.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::xpSubQueries()
 */
public function parseSearchResult(XPathHelper $xph)
{
    $itemQuery = "//div[@class='category-products']/ul[@class]/li";

    $fieldQueries = array(
        'product.name' => "h2[@class='product-name']",
        'product.url' => "h2[@class='product-name']/a/@href",
        'product.price_listed' => "div[@class='price-box']/p[@class='special-price']/span[@class='price']",
        'product.price_retail' => "div[@class='price-box']/p[@class='old-price']/span[@class='price']",
    );

    $items = $xph->xpSubQueries($itemQuery, $fieldQueries);

    return $items;
}
/**
 * Queries name, URL and both prices for each row of #categoryTable.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::xpSubQueries()
 */
public function parseSearchResult(XPathHelper $xph)
{
    // Built key by key; the insertion order is the order of the original map.
    $fieldQueries = array();

    // Title link of the row.
    $fieldQueries['product.name'] = "div[@class='rowInfo']/div[@class='rowWrapper']/div[@class='rowInfoA']/h3/a";
    $fieldQueries['product.url'] = "div[@class='rowInfo']/div[@class='rowWrapper']/div[@class='rowInfoA']/h3/a/@href";

    // Price label container of the row.
    $fieldQueries['product.price_listed'] = "div[@class='rowInfo']/div[@class='rowWrapper']/div[@class='rowInfoB']/div/div[@id='priceLabelContainer']/div[@class='saleValue-Price-Search']";
    $fieldQueries['product.price_retail'] = "div[@class='rowInfo']/div[@class='rowWrapper']/div[@class='rowInfoB']/div/div[@id='priceLabelContainer']/div[@class='listRegular-Price'][1]";

    return $xph->xpSubQueries("//div[@id='categoryTable']/div[@class='row']", $fieldQueries);
}
/**
 * Queries name, URL, rating and both prices for each 'prodSlotWide'
 * element below #display-results-content.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::xpSubQueries()
 */
public function parseSearchResult(XPathHelper $xph)
{
    $slotQuery = "//div[@id='display-results-content']/div[@class='prodSlotWide']";

    $fieldQueries = array(
        'product.name' => "p[@class='description']",
        'product.url' => "p[@class='description']/a/@href",
        // The rating is taken from the title attribute of the image.
        'product.rating' => "div[@class='details']/div[@class='starsAndPrice']/span/a/img/@title",
        // First text node of the price span vs. the crossed-out child span.
        'product.price_listed' => "div[@class='details']/div[@class='starsAndPrice']/span[@class='price']/text()[1]",
        'product.price_retail' => "div[@class='details']/div[@class='starsAndPrice']/span[@class='price']/span[@class='crossed-out-price']",
    );

    $slots = $xph->xpSubQueries($slotQuery, $fieldQueries);

    return $slots;
}
?> </tr> </table> </td> <td title="<?php echo $v_rkey; ?> "><?php echo $v_rdom; ?> </td> <?php if ($sitenodelist->length != 0) { ?> <td title="<?php echo XPathHelper::listSiteNodeValues($sitenodelist); ?> "><?php echo FLANG_H_SITE; ?> </td> <?php } else { ?> <td></td> <?php } ?> <td><span class="IPv_<?php echo $v_ipv; ?>
/**
 * Command-line entry point: reads the options and dispatches the -x action.
 *
 * Options read here:
 *   -x action   mandatory; one of the cases of the switch below
 *   -t target   handed to the action as its target (null when omitted)
 *   -i file     file with one input entry per line
 *   -k keyword  keyword (empty string when omitted)
 *   -s seconds  stored as integer in XPathHelper::$_sleep
 *   -p proxy    stored in XPathHelper::$_proxy
 *   -v level    stored as integer in XPathHelper::$_debug
 *   -h          print the help text
 *
 * NOTE(review): fatal_error() is presumably terminating the script; the
 * code after each call relies on that - confirm against its definition.
 */
function main()
{
    // 's:' has to be part of the option string, otherwise getopt() never
    // reports -s and the XPathHelper::$_sleep branch below is dead code.
    $opt = getopt('x:t:i:k:hv:p:s:');

    if (isset($opt['h'])) {
        // $help is not a local of this function. Presumably the script
        // defines it at file level (TODO confirm); read it from the global
        // scope and fall back to a summary of the options parsed above.
        $helpText = isset($GLOBALS['help'])
            ? $GLOBALS['help']
            : "usage: -x <action> [-t <target>] [-i <input-file>] [-k <keyword>] [-s <seconds>] [-p <proxy>] [-v <level>] [-h]";
        fatal_error($helpText);
    }
    if (isset($opt['v'])) {
        XPathHelper::$_debug = intval($opt['v']);
    }
    if (isset($opt['p'])) {
        XPathHelper::$_proxy = $opt['p'];
    }
    $target = isset($opt['t']) ? $opt['t'] : null;
    if (!isset($opt['x'])) {
        fatal_error("-x argument is mandatory");
    }
    if (isset($opt['s'])) {
        XPathHelper::$_sleep = (int) $opt['s'];
    }

    if (isset($opt['i'])) {
        $contents = file_get_contents($opt['i']);
        if ($contents === false) {
            // An unreadable file must not silently become "no input".
            fatal_error("cannot read input file: " . $opt['i']);
        }
        $input = explode("\n", (string) $contents);
        // Drop the empty entry produced by a trailing newline.
        if (end($input) === '') {
            array_pop($input);
        }
    } else {
        $input = array($target);
    }

    $hasKeyword = isset($opt['k']);
    $keyword = $hasKeyword ? $opt['k'] : '';

    switch ($opt['x']) {
        case 'product-search':
            productSearch($keyword, $target);
            break;

        case 'product-details':
            foreach ($input as $url) {
                productDetails($keyword, $url);
            }
            break;

        case 'product-details2':
            foreach ($input as $url) {
                productDetails2($url);
            }
            break;

        case 'product-parse':
            foreach ($input as $url) {
                productParse($url);
            }
            break;

        case 'product-listall':
            productfinder_listAll($target);
            break;

        case 'product-search-upc':
            // A single -k keyword takes precedence over the input list.
            priceCheck($hasKeyword ? array($keyword) : $input, $target);
            break;

        case 'product-search-upc2':
            priceCheck2($hasKeyword ? array($keyword) : $input, $target);
            break;

        case 'product-search-mpn':
            if ($hasKeyword) {
                fatal_error("unexpected -k option for action 'product-search-mpn'");
            } else {
                priceCheckMPN($input, $target);
            }
            break;

        case 'fetch':
            XPathHelper::$_curlopts[CURLOPT_FOLLOWLOCATION] = true;
            $xph = new XPathHelper($target);
            echo $xph->dump();
            break;

        default:
            fatal_error("unexpected -x action: " . $opt['x']);
    }
}
/**
 * Queries name, URL, SKU and rating for each 'itemCell' in the first
 * child div of the 'productList' container.
 *
 * @param XPathHelper $xph helper wrapping the loaded page
 * @return mixed the unmodified result of XPathHelper::xpSubQueries()
 */
public function parseSearchResult(XPathHelper $xph)
{
    $cellQuery = "//div[@class='productList']/div[1]/div[@class='itemCell']";

    $fieldQueries = array(
        'product.name' => "div[@class='itemText']/div[@class='wrapper']/a/span[@class='itemDescription' and starts-with(@id, 'title')]",
        'product.url' => "div[@class='itemText']/div[@class='wrapper']/a/@href",
        // The SKU is read from the feature list entry containing "Model #:".
        'product.sku' => "div[@class='itemText']/ul[@class='featureList']/li[contains(., 'Model #:')]/text()",
        'product.rating' => "div[@class='itemGraphics']/a[@class='itemRating']/@title",
    );

    $cells = $xph->xpSubQueries($cellQuery, $fieldQueries);

    return $cells;
}