// Sample target URLs, left disabled:
// $url = "http://repucom.net/contact/usa/";
// $url = "http://stdcxx.apache.org/doc/stdlibug/26-1.html";
// $url = "http://www.archifind.co.il/%D7%9E%D7%A2%D7%A6%D7%91%D7%99-%D7%A4%D7%A0%D7%99%D7%9D?page=1&start_10=0&start_20=0&start_30=0&code=&expertise=0";
// $url = "http://www.localbiz.co.il/Business/Biz-Category.aspx?PageNum=1&CategoryId=48&CityId=-1&Can=%D7%90%D7%99%D7%A0%D7%98%D7%A8%D7%A0%D7%98+-+%D7%A9%D7%99%D7%95%D7%95%D7%A7+%D7%95%D7%A4%D7%A8%D7%A1%D7%95%D7%9D&Cin=";
// $url = "http://www.hasut.co.il/24683";
// $url = "http://we.keepitsimple.co.il/user/87";

// Result lists passed to the extraction helpers for every page and finally
// to writeToFile(). (Presumably the helpers fill them by reference; confirm
// against their definitions.)
$phoneList = array();
$emailList = array();

// Case-insensitive pattern capturing something shaped like user@host.tld.
$emailRegex = "/([A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,})/i";

// Phone number shapes, one pattern per notation.
$phoneRegex = array(
    // ddd-ddd-dddd (dash or space separated)
    "/(\\d{3}[- ]\\d{3}[- ]\\d{4})/",
    // exactly ten digits delimited by a non-alphanumeric character on each side
    "/[^a-zA-Z0-9](\\d{10})[^a-zA-Z0-9]/",
    // (ddd) ddd-dddd, with spaces or slashes after the parenthesised group
    "/(\\(\\d{3}\\)[ \\/]+\\d{3}[- ]\\d{4})/",
    // optional "+", leading digit group, then ddd-ddd-dddd
    "/(\\+?\\d+[- ]\\d{3}[- ]\\d{3}[- ]\\d{4})/",
    // ddd-ddddddd
    "/(\\d{3}[- ]\\d{7})/",
);

// Anchor hrefs that carry a paging parameter (page=, PageNum=, p=).
$urlRegex = array(
    "/<a[^>]+href\\s*=\\s*\"(\\S+page=\\d+\\S+)\"/",
    "/<a[^>]+href\\s*=\\s*[\"'](\\S+PageNum=\\d+\\S+)[\"']/",
    "/<a[^>]+href\\s*=\\s*\"(\\S+p=\\d+\\S+)\"/",
);

// Work queue of pages to scan, seeded with the start URL. The queue length
// is re-read on every pass because it is handed to getURLFromHTML() below.
$urlList = array($url);

$i = 0;
while ($i < count($urlList)) {
    $currentUrl = $urlList[$i];
    // error_log("Checking {$urlList[$i]}\n", 3, "error_log");
    // print "Checking {$urlList[$i]}\n";

    $htmlCode = getHTMLCode($currentUrl);

    getEmailFromHTML($htmlCode, $emailRegex, $emailList);
    getNumbersFromHTML($htmlCode, $phoneRegex, $phoneList);

    // Link discovery is optional; note that it receives the start URL,
    // not the URL of the page currently being scanned.
    if ($pageTracing) {
        getURLFromHTML($url, $htmlCode, $urlRegex, $urlList);
    }

    // Hard stop once the page at index 100 has been processed.
    if ($i == 100) {
        break;
    }

    $i++;
}

writeToFile($fileName, $emailList, $phoneList);

// ---- test
// $htmlCode = getHTMLCode($url);
// $dom = new DOMDocument();
// $dom -> loadHTML($htmlCode);
// $xpath = new DOMXPath($dom);
<?php

require "core.php";

// Listing page whose products are scraped into an HTML list.
$url = "http://www.jabong.com/home-living/?q=home%20%26%20furniture&qc=home%20%26%20furniture&r=1";

// Raw markup of the page. getHTMLCode() is not defined in this script;
// presumably it comes from core.php (confirm).
$html = getHTMLCode($url);

// Product image URLs (named group "img").
$regex = '/<img src="(?P<img>[^"]*)" width="176" height="255" alt="" title="" class="itm-img" \\/>/';
preg_match_all($regex, $html, $image);

// Disabled offer-label extraction. While this stays commented out, $new is
// never set and the offer column below prints nothing.
/*
$regex='/class="offer-in txt-up clr-fff fs11">(?P<oldOrNew>[^<]*)<\/small>/';
preg_match_all($regex,$html,$new);
*/

// Brand names (named group "GG").
$regex = '/<span class="qa-brandName title mt30 c999 prod-ellipsis">(?P<GG>[^<]*)<\\/span>/';
preg_match_all($regex, $html, $g);

// Product titles (named group "TITLE").
$regex = '/<span class="qa-brandTitle fs11 c999 prod-ellipsis">(?P<TITLE>[^<]*)<\\/span>/';
preg_match_all($regex, $html, $title);

// Marked prices (named group "markedPrice").
$regex = '/<strong class="fs16 qa-price">(?P<markedPrice>[^<]*)<\\/strong>/';
preg_match_all($regex, $html, $m_price);

// Array keys are quoted strings. The previous bare words (img, GG, ...) were
// undefined-constant lookups, which throw an Error on PHP 8 that "@" cannot
// suppress. Missing columns are handled with isset() instead of "@".
$images = isset($image['img']) ? $image['img'] : array();
$columns = array(
    isset($new['oldOrNew']) ? $new['oldOrNew'] : array(),
    isset($g['GG']) ? $g['GG'] : array(),
    isset($title['TITLE']) ? $title['TITLE'] : array(),
    isset($m_price['markedPrice']) ? $m_price['markedPrice'] : array(),
);

// The scraped text comes from a remote page, so it is escaped before being
// echoed. Double-encoding is switched off because the captured fragments are
// HTML source and may already contain entities; ENT_SUBSTITUTE keeps
// malformed UTF-8 from blanking the whole value.
$escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;

$i = 0;
echo "<ol>";
foreach ($images as $a) {
    $a = htmlspecialchars($a, $escapeFlags, 'UTF-8', false);
    echo "<li><img src = '{$a}'>";

    // Columns are matched to the image purely by position in the page.
    foreach ($columns as $column) {
        if (isset($column[$i])) {
            echo htmlspecialchars($column[$i], $escapeFlags, 'UTF-8', false);
        }
    }

    echo "</li><br>";
    $i++;
}
echo "</ol>";

/* $x = "hi"; echo '$x'; print $x