// $url = "http://repucom.net/contact/usa/";
// $url = "http://stdcxx.apache.org/doc/stdlibug/26-1.html";
// $url = "http://www.archifind.co.il/%D7%9E%D7%A2%D7%A6%D7%91%D7%99-%D7%A4%D7%A0%D7%99%D7%9D?page=1&start_10=0&start_20=0&start_30=0&code=&expertise=0";
// $url = "http://www.localbiz.co.il/Business/Biz-Category.aspx?PageNum=1&CategoryId=48&CityId=-1&Can=%D7%90%D7%99%D7%A0%D7%98%D7%A8%D7%A0%D7%98+-+%D7%A9%D7%99%D7%95%D7%95%D7%A7+%D7%95%D7%A4%D7%A8%D7%A1%D7%95%D7%9D&Cin=";
// $url = "http://www.hasut.co.il/24683";
// $url = "http://we.keepitsimple.co.il/user/87";
// Result accumulators handed to the extraction helpers below
// (presumably filled by reference — confirm against the helper signatures).
$emailList = [];
$phoneList = [];

// Case-insensitive pattern for a single e-mail address.
$emailRegex = "/([A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,})/i";

// Phone number shapes, one pattern per supported format.
$phoneRegex = [
    // 3-3-4 digits separated by a dash or a space
    "/(\\d{3}[- ]\\d{3}[- ]\\d{4})/",
    // 10 consecutive digits delimited by non-alphanumeric characters
    "/[^a-zA-Z0-9](\\d{10})[^a-zA-Z0-9]/",
    // (3 digits) followed by 3-4 digits
    "/(\\(\\d{3}\\)[ \\/]+\\d{3}[- ]\\d{4})/",
    // optional "+" and a numeric prefix followed by 3-3-4 digits
    "/(\\+?\\d+[- ]\\d{3}[- ]\\d{3}[- ]\\d{4})/",
    // 3 digits, a separator, then 7 digits
    "/(\\d{3}[- ]\\d{7})/",
];

// Anchor patterns that pick out pagination links (page=, PageNum=, p=).
$urlRegex = [
    "/<a[^>]+href\\s*=\\s*\"(\\S+page=\\d+\\S+)\"/",
    "/<a[^>]+href\\s*=\\s*[\"'](\\S+PageNum=\\d+\\S+)[\"']/",
    "/<a[^>]+href\\s*=\\s*\"(\\S+p=\\d+\\S+)\"/",
];

// Work queue seeded with the start URL. count() is re-evaluated on every
// pass, so any entries appended to $urlList during the loop are visited too.
$urlList = [$url];
$i = 0;
while ($i < count($urlList)) {
    $htmlCode = getHTMLCode($urlList[$i]);

    getEmailFromHTML($htmlCode, $emailRegex, $emailList);
    getNumbersFromHTML($htmlCode, $phoneRegex, $phoneList);

    // Follow pagination links only when page tracing is switched on.
    if ($pageTracing) {
        getURLFromHTML($url, $htmlCode, $urlRegex, $urlList);
    }

    // Hard cap: stop once the page at index 100 has been processed.
    if ($i == 100) {
        break;
    }

    $i++;
}

writeToFile($fileName, $emailList, $phoneList);
// ---- test
// $htmlCode = getHTMLCode($url);
// $dom = new DOMDocument();
// $dom -> loadHTML($htmlCode);
// $xpath = new DOMXPath($dom);
Ejemplo n.º 2
0
<?php

require "core.php";
// URL of the product listing page the data set is built from (hard-coded here).
$url = "http://www.jabong.com/home-living/?q=home%20%26%20furniture&qc=home%20%26%20furniture&r=1";

// Fetch the page markup (getHTMLCode is presumably provided by core.php).
$html = getHTMLCode($url);

// Each preg_match_all() below collects one attribute of every product tile
// into a named capture group. The groups are paired up by position when
// rendering, so entry N of each list is assumed to describe the same product.

// Product image URL.
$regex = '/<img src="(?P<img>[^"]*)" width="176" height="255" alt="" title="" class="itm-img" \\/>/';
preg_match_all($regex, $html, $image);

// Offer label extraction is disabled; re-enable both lines (and echo
// $new['oldOrNew'][$i] in the loop below) to bring it back:
// $regex='/class="offer-in txt-up clr-fff fs11">(?P<oldOrNew>[^<]*)<\/small>/';
// preg_match_all($regex,$html,$new);

// Brand name.
$regex = '/<span class="qa-brandName title mt30 c999 prod-ellipsis">(?P<GG>[^<]*)<\\/span>/';
preg_match_all($regex, $html, $g);

// Product title.
$regex = '/<span class="qa-brandTitle fs11 c999 prod-ellipsis">(?P<TITLE>[^<]*)<\\/span>/';
preg_match_all($regex, $html, $title);

// Marked price.
$regex = '/<strong class="fs16 qa-price">(?P<markedPrice>[^<]*)<\\/strong>/';
preg_match_all($regex, $html, $m_price);

echo "<ol>";
// Array keys are quoted strings: bare words such as img or GG are undefined
// constants, which raise an Error on PHP 8 that "@" cannot suppress.
foreach (($image['img'] ?? []) as $i => $src) {
    // The scraped URL lands inside a single-quoted attribute, so quotes must
    // be escaped; double_encode is off because the value was lifted from HTML
    // and may already contain entities such as &amp;.
    $safeSrc = htmlspecialchars($src, ENT_QUOTES, 'UTF-8', false);
    echo "<li><img src = '{$safeSrc}'>";

    // The text captures are HTML fragments that cannot contain "<" (see the
    // [^<]* groups), so they are emitted as-is; a missing entry prints nothing.
    echo $g['GG'][$i] ?? '';
    echo $title['TITLE'][$i] ?? '';
    echo $m_price['markedPrice'][$i] ?? '';
    echo "</li><br>";
}
echo "</ol>";
/* $x = "hi"; 
	echo '$x'; print $x