/**
 * Scrape the Taisugar historical price page for one date and store the row
 * whose product id is 01021050.
 *
 * The page is requested with the date as POST fields, every <td> of the
 * response is collected, and the cells are scanned in fixed strides. The
 * first product-id cell is read at index 13 and each following candidate is
 * 7 cells further on (presumably one table row spans 7 cells -- confirm
 * against the live page). The two cells after the id are taken as the price
 * per kg and the price per bag.
 *
 * When the product is found it is written with insertDB() and dumped with
 * print_r(). When the page has no such row nothing is inserted; previously
 * the scan ran past the end of the cell list and never terminated.
 *
 * @param mixed $db   Database handle, passed through to insertDB().
 * @param mixed $y    Year, sent as POST field "strYear".
 * @param mixed $m    Month, sent as POST field "strMonth".
 * @param mixed $d    Day, sent as POST field "strDay".
 * @param mixed $time Timestamp stored together with the prices.
 *
 * @return void
 */
function scrap($db, $y, $m, $d, $time)
{
    $postFields = array('strYear' => $y, 'strMonth' => $m, 'strDay' => $d);

    try {
        $url = 'http://g1.taisugar.com.tw/Sugar/Sugar_show_His.asp';
        $sugar = curl($url, $postFields);
        $packtSugarXpath = returnXPathObject($sugar);
        $td = $packtSugarXpath->query('//td'); // DOMNodeList of every table cell
        $cellCount = $td->length;

        $firstPidIndex = 13;      // index of the first product-id cell
        $stride = 7;              // distance between consecutive product-id cells
        $wantedPid = '01021050';  // the only product that gets stored

        // Keep only digits, sign and decimal separator from a cell's text.
        $numberFrom = function ($cell) {
            return filter_var($cell->nodeValue, FILTER_SANITIZE_NUMBER_FLOAT, FILTER_FLAG_ALLOW_FRACTION);
        };

        $result = null;

        // "$index + 2 < $cellCount" guarantees the id cell and both price
        // cells exist before any of them is dereferenced.
        for ($index = $firstPidIndex; $index + 2 < $cellCount; $index += $stride) {
            $pid = clean($numberFrom($td->item($index)));

            // Loose comparison is kept on purpose: clean() is defined
            // elsewhere and may normalise the id's leading zero.
            if ($pid != $wantedPid) {
                continue;
            }

            $result = array(
                'pid' => $pid,
                'pricePerKg' => $numberFrom($td->item($index + 1)),
                'pricePerBag' => $numberFrom($td->item($index + 2)),
                'time' => $time,
            );
            break;
        }

        if ($result === null) {
            // No data for this date, or the product is not listed.
            return;
        }

        insertDB($db, $result['pid'], $result['pricePerKg'], $result['pricePerBag'], $result['time']);
        print_r($result);
    } catch (Exception $ex) {
        echo "failed </br>";
    }
}
// NOTE(review): the next line closes a function whose beginning lies outside
// this chunk; it is left untouched.
return $results; }

/**
 * Parse an HTML string and return an XPath helper bound to the resulting DOM.
 *
 * The "@" operator silences the warnings loadHTML() raises on malformed
 * markup.
 *
 * @param string $item Raw HTML source.
 *
 * @return DOMXPath XPath object over the parsed document.
 */
function returnXPathObject($item) {
    $xmlPageDom = new DomDocument();
    @$xmlPageDom->loadHTML($item);
    $xmlPageXPath = new DOMXPath($xmlPageDom);
    return $xmlPageXPath;
}

// Prepare the INSERT once and bind its ten placeholders; the type string
// "ssssssssss" binds every column as a string.
// NOTE(review): $conn is created outside this chunk -- presumably a mysqli
// connection, confirm.
$stmt = $conn->prepare("INSERT INTO Products (title, price, payment, shippingOpt, shippingTime, bluetooth, brand, prdCondition, model, weight) VALUES (?,?,?,?,?,?,?,?,?,?)");
$stmt->bind_param("ssssssssss", $titledb, $pricedb, $paymentdb, $shippingOptdb, $shippingTimedb, $bluetoothdb, $branddb, $prdConditiondb, $modeldb, $weightdb);

// Walk the locally saved product pages page1 .. page99.
for ($x = 1; $x < 100; $x++) {
    $productPage = file_get_contents("/home/spontaneous/Desktop/kaymu/mob/page{$x}");
    // Fields scraped from the current page, rebuilt for every page.
    $products = array();
    $productPageXPath = returnXPathObject($productPage);

    // Product title: text of the first <span class="prd-title">, if any.
    $title = $productPageXPath->query('//span[@class="prd-title"]');
    if ($title->length > 0) {
        $products['title'] = trim($title->item(0)->nodeValue);
    }

    // Price: any element with id="price_box". The span-only selector below
    // is an earlier attempt kept for reference.
    //$price = $packtPageXPath->query('//span[@id="price_box"]');
    $price = $productPageXPath->query('//*[@id="price_box"]');
    if ($price->length > 0) {
        // Earlier assignment kept for reference:
        //$packtBook['price'] = trim($overview->item(0)->nodeValue);
        $products['price'] = trim($price->item(0)->nodeValue);
    }

    // Attribute boxes (the variable names suggest shipping details).
    $shiping = $productPageXPath->query('//div[@class="boxAttribute rtl-right"]');
    if ($shiping->length > 0) {
        // The "length - 1" bound leaves the last matched box unvisited.
        for ($i = 0; $i < $shiping->length - 1; $i++) {
            $children = $shiping->item($i)->childNodes;
            // Text of the box's second child node (index 1).
            $ship = trim($children->item(1)->nodeValue);
<?php
// Crawl the saved product-list pages and scrape the page link of each product.

/**
 * Parse an HTML string and return an XPath helper bound to the resulting DOM.
 *
 * Parser warnings from malformed markup are collected by libxml and
 * discarded instead of being hidden with "@". An empty or non-string input
 * yields an XPath object over an empty document, so queries simply match
 * nothing.
 *
 * @param string $item Raw HTML source.
 *
 * @return DOMXPath XPath object over the parsed document.
 */
function returnXPathObject($item)
{
    $xmlPageDom = new DomDocument();

    if (is_string($item) && $item !== '') {
        $previousSetting = libxml_use_internal_errors(true);
        $xmlPageDom->loadHTML($item);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }

    return new DOMXPath($xmlPageDom);
}

$file = 'links.txt';                  // output file, one absolute URL per line
$baseUrl = 'http://www.kaymu.com.np'; // prefix for the site-relative hrefs

// Saved listing pages are named page1 .. page97.
for ($i = 1; $i < 98; $i++) {
    $filename = '/home/spontaneous/Desktop/kaymu/mobile/page' . $i;

    $webPage = is_readable($filename) ? file_get_contents($filename) : false;
    if ($webPage === false || $webPage === '') {
        // A missing or empty page must not abort the whole crawl.
        trigger_error("Skipping unreadable or empty page: {$filename}", E_USER_WARNING);
        continue;
    }

    $packtPageXPath = returnXPathObject($webPage);
    $anchor = $packtPageXPath->query('//*[@id="productsCatalog"]/div/div/a');

    // Collect this page's links and append them with a single write rather
    // than reopening the output file for every link.
    $links = '';
    for ($j = 0; $j < $anchor->length; $j++) {
        $links .= $baseUrl . $anchor->item($j)->getAttribute('href') . "\n";
    }

    if ($links !== '') {
        file_put_contents($file, $links, FILE_APPEND);
    }
}
// Scrape team names and Elo ratings from the GosuGamers Dota 2 ranking pages
// into $scrapedData (team name => Elo string with thousands separators
// removed), then dump the collected map.

// Reuse an existing $scrapedData array if an earlier part of the script
// created one; otherwise start empty so print_r() always has a value.
if (!isset($scrapedData) || !is_array($scrapedData)) {
    $scrapedData = array();
}

$rankingUrls = array(
    'http://www.gosugamers.net/dota2/rankings?page=1',
    'http://www.gosugamers.net/dota2/rankings?page=2', // teams 51-100
);

foreach ($rankingUrls as $rankingUrl) {
    $rankingXPath = returnXPathObject(getPage($rankingUrl));

    // Names and Elo values are matched by position in their node lists.
    $teamNames = $rankingXPath->query('//span[@class="main no-game"]');
    $teamElos = $rankingXPath->query('//td[@class="numbers"]');

    // Bound the loop by the shorter list so neither item() call can return
    // null; the old loop only stopped after reading a property on null.
    $rowCount = min($teamNames->length, $teamElos->length);

    for ($i = 0; $i < $rowCount; $i++) {
        $teamName = $teamNames->item($i)->nodeValue;

        // Skip only truly empty names; a name such as "0" is valid and must
        // not end the scan.
        if ($teamName === null || $teamName === '') {
            continue;
        }

        // Strip thousands separators, e.g. "1,000" becomes "1000".
        $scrapedData[$teamName] = str_replace(',', '', $teamElos->item($i)->nodeValue);
    }
}

print_r($scrapedData);
// NOTE(review): the next line closes a function whose beginning lies outside
// this chunk; it is left untouched.
return $result; }

// Initialised empty here; nothing in this chunk writes to it.
$scrapedData = array();

/**
 * Convert an HTML string into an XPath object.
 *
 * The "@" operator silences the warnings loadHTML() raises on malformed
 * markup.
 *
 * @param string $item Raw HTML source.
 *
 * @return DOMXPath XPath object over the parsed document.
 */
function returnXPathObject($item) {
    $xmlPageDom = new DOMDocument(); // empty DOM document
    @$xmlPageDom->loadHTML($item); // parse the HTML into it
    $xmlPageXPath = new DOMXPath($xmlPageDom); // XPath helper bound to that DOM
    return $xmlPageXPath;
}

// Fetch the dota2lounge front page and list each match with its betting odds.
$loungepage = getPage('http://www.dota2lounge.com/');
$loungeXPath = returnXPathObject($loungepage);
echo 'UPCOMING GAMES:<br>';
$match = $loungeXPath->query('//div[@class="match"]');
$i = 0; // index into $match
$j = 0; // index into the "teamtext" nodes; $j and $j + 1 form one pairing
// NOTE(review): item($i) returns null once $i passes the last match, so this
// condition reads a property on null at the end of the list -- confirm how
// the loop is expected to terminate. The increments of $i and $j are not
// visible in this chunk.
while ($match->item($i)->nodeValue) {
    // NOTE(review): this query does not depend on $i or $j and is repeated
    // on every iteration.
    $teams = $loungeXPath->query('//div[@class="teamtext"]');
    // Each text is split by position: everything except the last three
    // characters is the team name; the two characters before the final one
    // are the odds (presumably the text ends in "NN%" -- confirm).
    $team1 = substr($teams->item($j)->nodeValue, 0, -3);
    $odds1 = substr($teams->item($j)->nodeValue, -3, -1);
    $team2 = substr($teams->item($j + 1)->nodeValue, 0, -3);
    $odds2 = substr($teams->item($j + 1)->nodeValue, -3, -1);
    echo $team1 . ' (' . $odds1 . '%) ' . $team2 . ' (' . $odds2 . '%)<br>';
    echo 'ACTUAL ODDS: ';
    // calculateOdds() and $conn are defined outside this chunk.
    $actualOdds = calculateOdds($team1, $team2, $conn);
    // Results below 0.01 are replaced with "N/A".
    if ($actualOdds < 0.01) {
        $actualOdds = 'N/A';