<?php
require_once 'simpletest/browser.php';

// Shared SimpleTest browser instance; GetPageSource() reaches it through
// $GLOBALS['browser'], so it must stay a global named $browser.
// Plain assignment is used on purpose: `=& new` is deprecated since PHP 5.3
// and is a parse error from PHP 7 onwards.
$browser = new SimpleBrowser();
$browser->useCookies();

$source = GetPageSource('https://www.nutritionxcellence.com/protein');
$dom = GetDomObject($source);
$xpath = GetXPathObject($dom);

// Print the value of the first attribute of every anchor found on the page.
$subcCategoriesLinksNodes = $xpath->query('//a');
foreach ($subcCategoriesLinksNodes as $node1) {
    // An anchor without any attribute yields null from item(0); print an
    // empty value for it instead of dereferencing null.
    $firstAttribute = $node1->attributes->item(0);
    $ttt = is_null($firstAttribute) ? '' : $firstAttribute->nodeValue;
    echo $ttt . '<br />';
}
echo 'asdf';

/**
 * Fetches a URL with the shared global browser and returns the page content.
 *
 * @param string $URL Address to request.
 *
 * @return mixed Whatever the browser's getContent() reports for the request.
 */
function GetPageSource($URL)
{
    $GLOBALS['browser']->get($URL);

    return $GLOBALS['browser']->getContent();
}

/**
 * Builds a DOMDocument from HTML source.
 *
 * libxml errors are switched to internal handling so malformed markup does
 * not emit warnings.
 *
 * @param mixed $pageSource HTML markup, or null when no page was fetched.
 *
 * @return DOMDocument|null The document (left empty when the source is an
 *                          empty or non-string value), or null when
 *                          $pageSource is null.
 */
function GetDomObject($pageSource)
{
    $dom = new DOMDocument('1.0');
    libxml_use_internal_errors(true);

    if (is_null($pageSource)) {
        return null;
    }

    // loadHTML() rejects an empty string (warning on older PHP, ValueError on
    // PHP 8); in that case hand back the empty document as before, minus the
    // error.
    if (is_string($pageSource) && $pageSource !== '') {
        $dom->loadHTML($pageSource);
    }

    return $dom;
}
/**
 * Collects product page URLs from a category listing page.
 *
 * Every <img> whose parent element carries an href containing '/index.php/'
 * is treated as a product link; the absolute URL is appended to $products.
 * Terminates the script when the listing page comes back empty.
 *
 * @param string $catURL   URL of the category listing page.
 * @param array  $products Receives the absolute product URLs (by reference).
 *
 * @return mixed The raw source of the listing page.
 */
function GetProductsLinks($catURL, &$products)
{
    $listingPageSource = GetPageSource($catURL);
    if (empty($listingPageSource)) {
        die('Sorry cURL failed to reterive page source');
    }

    $dom = GetDomObject($listingPageSource);

    foreach ($dom->getElementsByTagName('img') as $img) {
        $parent = $img->parentNode;
        // Only element nodes have attributes; skip anything else.
        if (!($parent instanceof DOMElement)) {
            continue;
        }

        $imgParentAnchor = 'http://www.nutritionxcellence.com' . $parent->getAttribute('href');

        // Strict comparison: strpos() returns an offset or false.
        if (strpos($imgParentAnchor, '/index.php/') !== false) {
            $products[] = $imgParentAnchor;
        }
    }

    return $listingPageSource;
}
/**
 * Scrapes one product page and hands the extracted fields to PushDataToCSV().
 *
 * Fields that cannot be found on the page are left as empty strings
 * (InStock defaults to 'In stock'). After the row is pushed, the raw page
 * title followed by ' -- Done' is echoed.
 *
 * @param string $URL     Product page URL.
 * @param string $catName Category name stored in the row's Category column.
 *
 * @return void
 */
function ScrapeProductPage($URL, $catName)
{
    $prodPageSource = GetPageSource($URL);
    $dom = GetDomObject($prodPageSource);
    $xPath = GetXPathObject($dom);

    $Category = $catName;
    $Manufacturer = '';
    $ProductTitle = '';
    $ProductDescription = '';
    $SalesPrice = ''; // (USD)
    $MSRPPrice = '';
    $InStock = 'In stock';
    $ProductImages = '';
    $NutritionFacts = '';
    $OtherIngredients = '';
    $AllerginInfo = '';
    $Warnings = '';
    // Raw heading text; initialised so the final echo is safe when no title is found.
    $title = '';

    // Strict comparison so a match at offset 0 is not mistaken for "not found".
    if (strpos($prodPageSource, 'Notify Me') !== false) {
        $InStock = "Out of stock";
    }

    // DOMXPath::query() returns a (possibly empty) node list, never null, so
    // each lookup checks the list length before touching item(0).
    $manufacturerNode = $xPath->query("//*[contains(@class, 'manufacturer')]");
    if ($manufacturerNode instanceof DOMNodeList && $manufacturerNode->length > 0) {
        $Manufacturer = $manufacturerNode->item(0)->nodeValue;
        $Manufacturer = trim(str_replace('Manufacturer:', '', $Manufacturer));
    }

    $titleNode = $xPath->query('//*[@id="main"]/div[2]/h1');
    if ($titleNode instanceof DOMNodeList && $titleNode->length > 0) {
        $title = $titleNode->item(0)->nodeValue;
        // Strip the "<manufacturer>:" prefix only when a manufacturer is
        // known; with an empty manufacturer the search string would be ":"
        // and every colon in the title would be removed.
        if ($Manufacturer !== '') {
            $ProductTitle = trim(str_replace("{$Manufacturer}:", '', $title));
        } else {
            $ProductTitle = trim($title);
        }
    }

    $descNode = $xPath->query("//*[contains(@class, 'product-description')]");
    if ($descNode instanceof DOMNodeList && $descNode->length > 0) {
        $ProductDescription = $descNode->item(0)->nodeValue;
        $ProductDescription = trim(preg_replace('/Description/', '', $ProductDescription));
        $ProductDescription = PrepareField($ProductDescription);
    }

    $SalesPriceNode = $xPath->query("//*[contains(@class, 'PricesalesPrice')]");
    if ($SalesPriceNode instanceof DOMNodeList && $SalesPriceNode->length > 0) {
        $SalesPrice = $SalesPriceNode->item(0)->nodeValue;
        // Keep only the decimal amount when one is present; otherwise the raw text stays.
        if (preg_match("/\\d+\\.\\d+/", $SalesPrice, $matches)) {
            $SalesPrice = $matches[0];
        }
    }

    // preg_match() always fills $matchs with an array, so test its return
    // value rather than is_null() before reading the capture group.
    if (preg_match("/MSRP:.+?<s>(.+?)<\\/s>/", $prodPageSource, $matchs)) {
        $MSRPPrice = $matchs[1];
    }

    $imageNode = $dom->getElementById('medium-image');
    if (!is_null($imageNode)) {
        $ProductImages = 'http://www.nutritionxcellence.com' . $imageNode->getAttribute('src');
    }

    // Elements with ids tabs-1 .. tabs-4 feed the four remaining columns, in this order.
    for ($i = 1; $i <= 4; $i++) {
        $tabNode = $dom->getElementById("tabs-{$i}");
        if (is_null($tabNode)) {
            continue;
        }

        switch ($i) {
            case 1:
                $NutritionFacts = PrepareField($tabNode->nodeValue);
                break;
            case 2:
                $OtherIngredients = PrepareField($tabNode->nodeValue);
                break;
            case 3:
                $AllerginInfo = PrepareField($tabNode->nodeValue);
                break;
            case 4:
                $Warnings = PrepareField($tabNode->nodeValue);
                break;
        }
    }

    // Push data to a csv file.
    $headerArray = array(
        "Category",
        "Manufacturer",
        "ProductTitle",
        "ProductDescription",
        "SalesPrice",
        "MSRPPrice",
        "InStock",
        "ProductImages",
        "NutritionFacts",
        "OtherIngredients",
        "AllerginInfo",
        "Warnings",
    );
    // Every cell is passed as a string, as before.
    $rowArray = array(
        "{$Category}",
        "{$Manufacturer}",
        "{$ProductTitle}",
        "{$ProductDescription}",
        "{$SalesPrice}",
        "{$MSRPPrice}",
        "{$InStock}",
        "{$ProductImages}",
        "{$NutritionFacts}",
        "{$OtherIngredients}",
        "{$AllerginInfo}",
        "{$Warnings}",
    );
    PushDataToCSV('data.csv', $headerArray, $rowArray);

    echo $title . ' -- Done' . '<br />';
}