Example #1
$website = $argv[2];
// Define it as a constant
// TODO: Remove this constant
define("SITE", $website);
define("GOOGLE_BASE_URL", "https://www.google.com/search?&q=");
define("GOOGLE_SEARCH_URL", "https://www.googleapis.com/customsearch/v1element?" . "key=AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY&" . "rsz=filtered_cse&" . "num=10&" . "hl=en&" . "prettyPrint=false&" . "source=gcsc&" . "gss=.com&" . "sig=ee93f9aae9c9e9dba5eea831d506e69a&" . "cx=000351285113061488967:p1lh-gcxv08&" . "q=_QUERY&" . "sort=&" . "googlehost=www.google.com&" . "oq=_QUERY&" . "gs_l=partner.12...25371.25371.0.26346.1.1.0.0.0.0.170.170.0j1.1.0.gsnos%2Cn%3D13...0.1981j3853693j3..1ac.1.25.partner..1.0.0.Wsa_5yXJf84&" . "callback=google.search.Search.apiary15963");
// Define the output CSV file options
$csv['dir'] = OUTPUT_DIR;
$csv['file'] = SITE . "_results_" . date("Y-m-d_H-i-s", time()) . ".csv";
$csv['columns'] = array("Page Title", "Query", "Google URL", SITE . " URL");
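// Sketch of the intended CSV output (assumption: the Scraper itself opens this
// file and writes the header row, roughly like the commented-out lines below):
//   $fh = fopen($csv['dir'] . "/" . $csv['file'], "w");
//   fputcsv($fh, $csv['columns']);
//   fclose($fh);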
// Log file named after the running script, website, and timestamp
$log_file = $script_name . "_" . SITE . "_" . date("Y-m-d_H-i-s", time()) . ".log";
// Scraper object
$scraper = new Scraper(USERAGENT, MIN_SLEEP_TIME, MAX_SLEEP_TIME, $input_file, $csv, $log_file);
// Start crawling
$scraper->crawl();
// Finish crawling
fwrite($scraper->log, "[END]\r\n\r\n");
echo "\nCSV file created: " . $csv['dir'] . "/" . $csv['file'] . "\n";
echo "\nDONE\n";
/**
 * @desc Scraper class
 */
class Scraper
{
    public function __construct($useragent, $min_sleep_time, $max_sleep_time, $input_file, $csv, $log_file)
    {
        // cURL client used for all outgoing HTTP requests
        $this->curl = new CurlClient();
        $this->curl->set_user_agent($useragent);