/**
  * Scrape the given topic url for all the posts in a topic and return as xml.
  */
 private function scrape_topic($link)
 {
     $scraper = new Scraper($link);
     $scraper->run();
     $topic_scraper = new GoogleGroupsTopicScraper($scraper->html);
     $topic = $topic_scraper->run();
     $i = 0;
     $xml = '';
     if (is_array($topic)) {
         foreach ($topic as $detail) {
             $xml .= "      <post idx=\"{$i}\">\n";
              $xml .= '        <author>' . htmlspecialchars($detail['author'], ENT_QUOTES) . "</author>\n";
              $xml .= '        <email>' . htmlspecialchars($detail['email'], ENT_QUOTES) . "</email>\n";
             $xml .= '        <date>' . $detail['date'] . "</date>\n";
             $xml .= '        <timestamp>' . $detail['timestamp'] . "</timestamp>\n";
             $xml .= "        <body>\n";
             $xml .= "<![CDATA[\n" . $detail['body'] . "\n]]>\n";
             $xml .= "        </body>\n";
             $xml .= "      </post>\n";
             $i++;
         }
     } else {
         print "ERROR: bad topic (url={$link})\n";
     }
     return $xml;
 }
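 /**
  * Illustrative sketch only (not in the original source): shows how a caller
  * might consume the fragment produced by scrape_topic() above. The <topic>
  * root element and this helper's name are assumptions for illustration.
  */
 private function parse_topic_authors($link)
 {
     $fragment = $this->scrape_topic($link);
     $xml = simplexml_load_string("<topic>\n" . $fragment . "</topic>");
     $authors = array();
     if ($xml !== false) {
         foreach ($xml->post as $post) {
             $authors[(int) $post['idx']] = (string) $post->author;
         }
     }
     return $authors;
 }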
 /**
  * Execute the console command.
  *
  * @return void
  */
 public function fire()
 {
     // message
     $data = array('type' => $this->argument('type'), 'id' => $this->argument('id'), 'date' => $this->argument('date'));
     // fire
     $scraper = new Scraper();
     $scraper->execute($data);
 }
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $default_url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&parent_category_rn=12518&top_category=12518&langId=44&beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&categoryId=185749&listId=&storeId=10151&promotionId=#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";
     $scraper = new Scraper($default_url);
     $scraped_products = $scraper->scrape();
      $pretty_print = (bool) $input->getOption('prettyprint');
      $json = $scraper->construct_json($scraped_products, $pretty_print);
     $output->writeln($json);
 }
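 /**
  * Illustrative sketch only (not shown in the original source): how the
  * 'prettyprint' option read in execute() above would typically be declared
  * for a Symfony Console command. The command name is hypothetical and the
  * sketch assumes Symfony\Component\Console\Input\InputOption is imported.
  */
 protected function configure()
 {
     $this->setName('scrape')
          ->addOption('prettyprint', null, InputOption::VALUE_NONE, 'Pretty-print the JSON output');
 }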
 public function run()
 {
     $params = array('q' => $this->q, 'page' => 1, 'rpp' => 100);
     if (!empty($this->since)) {
         $params['since'] = date('Y-m-d', strtotime($this->since));
     }
     if (!empty($this->locale)) {
         $params['locale'] = $this->locale;
     }
     if (!empty($this->geocode)) {
         $params['geocode'] = $this->geocode;
     }
     for ($i = 1; $i < 10; $i++) {
         $params['page'] = $i;
         $this->url = 'http://search.twitter.com/search.json?' . http_build_query($params, '', '&');
         $response = Scraper::scrape($this->url);
         if ($json = json_decode($response)) {
             if (isset($json->total)) {
                 $this->result = $json->total;
                 break;
             }
         }
     }
     return $this->result;
 }
 public function testFindSum()
 {
     $json = Scraper::start();
     $array = json_decode($json, true);
     //print_r($array['total']);
     $input = $array['results'];
     $total = $array['total'];
     $sumFinder = new SumFinderClass($input);
     $this->assertEquals($total, $sumFinder->findSum());
 }
Example #6
 function get_scraper($service_agenda, $periodo, $curso)
 {
     $scraper = new \Scraper();
     $response = $this->get_html($service_agenda, $periodo, $curso);
     if (!$response['status']) {
          // an error occurred while fetching the HTML
         return $response;
     }
      $result = $scraper->execute('#contenido table tr td table tr td', $response['content']);
      // content
      if ($result == null) {
          // retry with a different selector format
          $result = $scraper->execute('#contenido table tr td', $response['content']);
          // content
          return array("status" => true, "result" => $result[0], "code" => 200);
     } else {
         //OK
         return array("status" => true, "result" => $result, "code" => 200);
     }
 }
 function __construct($url, $twitter = null)
 {
     parent::__construct($url);
     $this->twitter = $twitter;
     $this->title = $this->crawler->filter('title')->first()->text();
     $this->rss = $this->crawler->filter('link[type="application/rss+xml"]')->first()->attr('href');
     if ($twitter) {
         $twitterInfo = new twitterAccountScraper($twitter);
         $this->description = $twitterInfo->description();
         $this->image = $twitterInfo->profilePic();
     }
 }
 public function run()
 {
     // Google Search API default parameters
     $baseURL = 'http://ajax.googleapis.com/ajax/services/search/web?';
     $defaultParams = array('v' => '1.0', 'rsz' => 'large', 'q' => $this->search);
     if (!empty($this->save)) {
         $defaultParams['save'] = $this->save;
     }
     if (!empty($this->language)) {
         $defaultParams['hl'] = $this->language;
     }
     if (!empty($this->country)) {
         if ($this->country === true) {
             if (!empty($this->language)) {
                 $defaultParams['gl'] = $this->language;
             }
         } else {
             $defaultParams['gl'] = $this->country;
         }
     }
     // iterate over pages and try to find match with host
     $perPage = 8;
     $this->result = false;
     for ($p = 0; $p < 7; $p++) {
         $start = $p * $perPage;
         // build request url
         $url = $baseURL . http_build_query($defaultParams, '', '&') . '&start=' . $start;
         $response = Scraper::scrape($url);
         if (!is_string($response)) {
             continue;
         }
         $response = json_decode($response);
         // try to find result set
         if (!is_object($response)) {
             continue;
         }
         if (!isset($response->responseData->results)) {
             continue;
         }
         // find hostname in result set
         foreach ($response->responseData->results as $index => $result) {
             if (!preg_match('/' . preg_quote($this->TestGroup->host, '/') . '.*/', $result->url)) {
                 continue;
             }
             $this->result = $start + $index + 1;
             break 2;
         }
     }
     return $this->result;
 }
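  // Worked example of the rank computed above (illustrative values): with 8
  // results per page, a match at $index = 2 on the second page ($p = 1, so
  // $start = 8) yields $this->result = 8 + 2 + 1 = 11, i.e. the 11th result.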
 public function Geocode($address)
 {
     //https://developers.google.com/maps/documentation/geocoding/index
      $address = str_replace(" ", "+", $address);
      // replace spaces with '+'
      $address = str_replace("&", "and", $address);
      // replace '&' with 'and'
     $url = 'https://maps.googleapis.com/maps/api/geocode/xml?address=' . $address . '&key=' . $this->API_key;
     // single curl request
     $fetch = parent::SingleCurl($url);
     //$fetch[EXE] is html, $fetch[INF] is info about scrape
     // wait for 0.2 seconds
     usleep(200000);
     return $fetch['BODY'];
 }
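  /**
   * Illustrative sketch only (not in the original source): reading the
   * coordinates out of the XML body returned by Geocode() above, following
   * the response format of the Google Geocoding API documented at the URL
   * referenced there.
   */
  public function GeocodeLatLng($address)
  {
      $xml = simplexml_load_string($this->Geocode($address));
      if ($xml === false || (string) $xml->status !== 'OK') {
          return null;
      }
      $location = $xml->result[0]->geometry->location;
      return array('lat' => (string) $location->lat, 'lng' => (string) $location->lng);
  }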
 public function run()
 {
     $response = Scraper::scrape($this->url);
     if (preg_match_all($this->regexp, $response, $found)) {
         if (isset($found['match'])) {
             $this->result = $found['match'][0];
         } else {
             $this->result = $found[1][0];
         }
         $this->result = preg_replace('@[.,]@', '', $this->result);
         // type conversion
         if (preg_match('@^-?\\s?\\d+$@', $this->result)) {
             $this->result = (double) $this->result;
         }
     }
     return $this->result;
 }
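  // Illustrative example of the kind of pattern the run() above supports
  // (hypothetical values): with $this->regexp = '@Downloads:\s*(?P<match>[\d.,]+)@'
  // and a response containing "Downloads: 1,234,567", the named 'match' group
  // is preferred, the separators are stripped, and the result becomes the
  // number 1234567.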
 function scrape()
 {
     $riga = 2;
     $i = 0;
      do {
          // iterate over the rows
          $filter = "//table[2]/tr[{$riga}]/th[1]";
          parent::XPathFilter($filter);
          $tmp = parent::toString('txt');
          if ($tmp == "") {
              break;
          }
         $this->siti[$i] = $tmp;
         $filter = "//table[2]/tr[{$riga}]/th[2]";
         parent::XPathFilter($filter);
         $tmp = parent::toString('txt');
         $this->nomi[$i] = $tmp;
         $riga++;
         $i++;
     } while ($tmp != "");
     $out = "";
     foreach ($this->nomi as $k => $v) {
         $a = trim($this->nomi[$k]);
         $b = trim($this->siti[$k]);
         $out .= "{\"nome\" : \"{$a}\",\"sito\" : \"{$b}\"},";
     }
     $out = substr($out, 0, -1);
     return "[" . $out . "]";
 }
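 /**
  * Alternative sketch only (not in the original source): producing the same
  * output with json_encode(), which also escapes quotes and non-ASCII
  * characters that the manual concatenation in scrape() passes through as-is.
  */
 function scrapeToJson()
 {
     $rows = array();
     foreach ($this->nomi as $k => $v) {
         $rows[] = array('nome' => trim($this->nomi[$k]), 'sito' => trim($this->siti[$k]));
     }
     return json_encode($rows);
 }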
Example #12
<?php

define('SCRAPER_ROOT_PATH', __DIR__);
require_once 'models/Scraper.php';
// Let's retrieve and save the list of films
// Taken from Filmaffinity TOP FA (just films from this year)
$scraper = new Scraper();
$scraper->scrapeLastFilms();
Example #13
        $string = preg_replace('/\\s\\s+/', ' ', $string);
        return $string;
    }
    private function startsWithPartofWord($name, $word)
    {
        // The general rule is that the first word of the name may start with
        // part of the SKU, so we extract the first word from the name and look
        // for it in the SKU.  The string "$name" should be cleaned before
        // calling this function.
        $firstWord = current(explode(" ", $name));
        if (stripos($word, $firstWord) === 0) { // strict comparison: stripos() returns false, not 0, when the word is absent
            return true;
        }
        return false;
    }
    private function deleteFirstWord($string)
    {
        $elements = explode(" ", $string);
        array_shift($elements);
        $shortString = trim(implode(" ", $elements));
        // Also trim off any starting hyphen and spaces.
        $shortString = trim($shortString, "-");
        $shortString = trim($shortString);
        return $shortString;
    }
}
if ($argc < 4) {
    die("Usage: php scraper.php inputFile outputFile errorFile");
}
$scraper = new Scraper($argv[1], $argv[2], $argv[3]);
$scraper->scrape();
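// Standalone sketch (not part of the original file) of the first-word check
// described in startsWithPartofWord() above; the product name and SKU are
// hypothetical. Note the strict === 0 comparison: stripos() returns false when
// the needle is absent, and a loose == 0 would treat that as a match.
$name = 'Acme Widget 3000 - Blue';
$sku = 'ACME-WIDGET-3000';
$firstWord = current(explode(' ', $name));   // "Acme"
var_dump(stripos($sku, $firstWord) === 0);   // bool(true)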
Example #14
<?php

require_once 'Scraper.php';
print_r(Scraper::start());
Example #15
 /**
  * To respond to any request we may need to prepare our data and check data integrity.
  */
 protected function prepareData()
 {
     $Scraper = new Scraper();
     $data = $Scraper->scrapData($this->url);
     return $data;
 }
Example #16
        <select onchange="setOption(\'max\')" id="max">
            <option value="1" ' . ($max === '1' ? 'selected="selected"' : '') . '>1</option>
            <option value="2" ' . ($max === '2' ? 'selected="selected"' : '') . '>2</option>
            <option value="3" ' . ($max === '3' ? 'selected="selected"' : '') . '>3</option>
            <option value="4" ' . ($max === '4' ? 'selected="selected"' : '') . '>4</option>
            <option value="30" ' . ($max === '30' ? 'selected="selected"' : '') . '>All</option>
        </select>
    </div>';
echo '</div>';
echo '<iframe style="display: none;width: 560px;height:315px;" id="frame" allowfullscreen></iframe><div id="frame-hover"></div>';
$db = new Datastore();
$db->tables();
$db->query("SELECT id, name, channel, type FROM channels ORDER BY name");
$channels = $db->fetch();
unset($db);
$ws = new Scraper();
foreach ($channels as $channel) {
    $ws->addChannel($channel['channel'], $sort, $channel['type']);
}
$ws->fetch();
foreach ($channels as $channel) {
    $counter = 0;
    $maxVideos = 1;
    $videos = $ws->fetchChannel($channel['channel'], !empty($auto));
    echo "<div class=\"channel\"><button class=\"channel-button\" onclick=\"hideshow('channel-{$channel['id']}', 200);\">{$channel['name']}</button></div>";
    echo "<div id=\"channel-{$channel['id']}\">";
    foreach ($videos as $video) {
        // User already watched video
        if (!empty($hide) && !empty($watched) && strpos($watched, '|' . $video['id']) !== false) {
            continue;
        }
Example #17
if (empty($_GET['url']) && empty($_GET['xml'])) {
    $incorrectFlag = true;
}
if ($incorrectFlag) {
    $html = <<<EOD
<br />
<div id="container">
<div id="response"><pre><p>Please ensure you've supplied both xpath AND a url OR a full XML.</p></pre>
</div>
</div>
EOD;
    echo $html;
    exit;
}
include_once "class.Scraper.php";
$scraper = new Scraper();
if (!empty($_GET['url'])) {
    $scraper->setUpScraper($_GET['url'], $_GET['xpath']);
}
if (!empty($_GET['xml'])) {
    $xmlFromString = $scraper->getXMLFromString($_GET['xml']);
    $scraper->setPageXML($xmlFromString);
    $scraper->setXpath($_GET['xpath']);
}
$queryResponse = $scraper->getResponse();
$output = "";
$output .= <<<EOD
<h3>Results of xpath query</h3>
<p>Here's the result of the xpath query you ran. And as a bonus, the entire XML tree is <a href="#xml">below</a> too. </p>
EOD;
$output .= <<<EOD
Example #18
 public function __construct()
 {
     parent::__construct();
     $this->setCategoryId(36);
 }
Example #19
<?php

include "library/Scraper.php";
$scraper = new Scraper();
$scrapData = $scraper->scrapData('http://www.dsebd.org');
var_dump($scrapData);
Example #20
 public static function scrape($url = null)
 {
     $s = new Scraper($url);
     return $s->read();
 }
Example #21
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, $userpass);
    }
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    $result = curl_exec($ch);
    if ($result === false) {
        $result = curl_error($ch);
    }
    curl_close($ch);
    return $result;
}
?>
    <?php 
if (isset($_POST['btnsubmit'])) {
    require_once 'scraper.php';
    $scraper = new Scraper();
    $url = $scraper->getPagePost($_POST['url']);
    ?>
  

     <div style="overflow-x: hidden; overflow-y: scroll; height:300px; width:100%" >
          <?php 
    echo $url;
    ?>
     </div>
     <?php 
}
?>
  </div>

</body>
Example #22
<?php

require_once 'vendor/autoload.php';
require_once './classes/Scraper.class.php';
$default_url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&parent_category_rn=12518&top_category=12518&langId=44&beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&categoryId=185749&listId=&storeId=10151&promotionId=#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";
$scraper = new Scraper($default_url);
$scraped_products = $scraper->scrape();
print_r($scraper->construct_json($scraped_products, true));
Example #23
<?php

// written by Davide Quadrelli
header("Content-Type: text/html;charset=UTF-8");
require_once 'include/Scraper.php';
$toret = array();
if (isset($_GET['url'])) {
    $scraper = new Scraper();
    $scraper->loadPage($_GET["url"], true);
    if (strpos($_GET['url'], "dlib")) {
        // d-lib page
        $scraper->XPathFilter("html/body[1]/form/table[3]/tr[1]/td[1]/table[5]/tr[1]/td[1]/table[1]/tr[1]/td[2]/node()");
        $toret[0] = "form1_table3_tr1_td1_table5_tr1_td1_table1_tr1_td2_";
    } else {
        if (strpos($_GET['url'], "unibo")) {
            // almajournal article
            $toret[0] = "div1_div3_div2_";
            /*$scraper->XPathFilter("//div[@id=\"articleTitle\"] | //div[@id=\"authorString\"] | //div[@id=\"articleAbstract\"] | //div[@id=\"articleSubject\"]
            	 | //div[@id=\"articleFullText\"] | //div[@id=\"authorString\"] | //div[@id=\"articleCitations\"] | //a[@id=\"pub-id::doi\"]");*/
            $scraper->XPathFilter("//div[@id=\"main\"]/node()");
            if ($scraper->toString() == "") {
                // it is a unibo site but not an article
                $toret[0] = "body1_";
                $scraper->XPathFilter("//body");
            }
        } else {
            $toret[0] = "body1_";
            $scraper->XPathFilter("//body");
        }
    }
    $toret[1] = $scraper->toString();
Example #24
        //
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        //
        curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        $this->result = curl_exec($ch);
        curl_close($ch);
    }
    private function exec_FGC()
    {
        $this->result = file_get_contents($this->url);
    }
    public function run()
    {
        switch (CURL_ENABLED) {
            case true:
                $this->exec_CURL();
                break;
            case false:
                $this->exec_FGC();
                break;
        }
    }
}
$url = $_GET['url'];
$scraper = new Scraper($url);
$scraper->run();
echo '<pre>';
print_r($scraper->result);
echo '</pre>';
 function bestemmie_are_coming()
 {
     //$pattern='/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u';
     $pattern = '/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]/u';
     $content = file_get_contents($this->url);
      // strip non-UTF-8 characters
     $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');
      // strip non-printable characters
     $new = preg_replace($pattern, '', $content);
     $this->to_clean = time() . ".tmp";
     $pagina = fopen($this->to_clean, "w");
     fwrite($pagina, $new);
     fclose($pagina);
     $base = $_SERVER['SERVER_NAME'];
     $last = $_SERVER['PHP_SELF'];
     $last = parent::getInitialURL($last);
     parent::loadPage("http://" . $base . $last . $this->to_clean);
 }
Example #26
<?php

/*
WHM Dynamic DNS Updater v2.1.0
By Silas Montgomery
Website: http://reticent.net
Email: nomsalis@reticent.net
*/
// Classes
require_once "class.ZoneUpdater.php";
require_once "class.Scraper.php";
require_once "class.Logger.php";
// Configuration
require_once "whmDynDns.config.php";
// Logic
try {
    $scraper = new Scraper();
    $scraper->SetUrls($websites);
    if ($ip = $scraper->GetIp()) {
        $updater = new ZoneUpdater();
        $updater->SetUsername($username);
        $updater->SetPassword($password);
        $updater->SetUrl($whmUrl);
        $updater->SetIp($ip);
        $updater->SetZones($zones);
        $updater->Update();
    }
} catch (Exception $e) {
    Logger::Write($e);
}
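// Hypothetical shape of the values expected from whmDynDns.config.php above;
// the real file is not shown, so every value here is an illustrative placeholder.
$websites = array('http://checkip.example.com/');   // pages that return the caller's public IP
$username = 'whm-username';
$password = 'whm-password-or-token';
$whmUrl = 'https://server.example.com:2087';
$zones = array('example.com', 'example.net');        // DNS zones whose records get updated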
Example #27
<?php

if (!empty($_POST)) {
    session_start();
    if (isset($_POST['form']) && (!isset($_SESSION['form']) || $_SESSION['form'] !== $_POST['form'])) {
        $_SESSION['form'] = $_POST['form'];
        if (!empty($_POST['name']) && !empty($_POST['id']) && !empty($_POST['type'])) {
            $name = $_POST['name'];
            $id = $_POST['id'];
            $type = $_POST['type'] === 'user' ? 'user' : 'channel';
            if (strlen($name) <= 20 && strlen($id) <= 48) {
                $ws = new Scraper();
                if ($ws->addChannel($id, 'new', $type)) {
                    $ws->fetch();
                    $response = $ws->fetchChannel($id);
                    unset($ws);
                    if (!empty($response)) {
                        $db = new Datastore();
                        $query = 'INSERT INTO channels VALUES
                                  (
                                      NULL,
                                      ?,
                                      ?,
                                      ?
                                   )';
                        $options = array('sss', &$name, &$id, &$type);
                        $db->query($query, $options);
                        unset($db);
                        echo 'Successfully added channel ' . htmlentities($name);
                    } else {
                        echo 'Channel ' . htmlentities($name) . ' does not exist or the page is down';
Example #28
$script_name = $argv[0];
$input_file = $argv[1];
$website = $argv[2];
// Define it as constant
// TODO : Remove this constant
define("SITE", $website);
define("GOOGLE_BASE_URL", "https://www.google.com/search?&q=");
define("GOOGLE_SEARCH_URL", "https://www.googleapis.com/customsearch/v1element?" . "key=AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY&" . "rsz=filtered_cse&" . "num=10&" . "hl=en&" . "prettyPrint=false&" . "source=gcsc&" . "gss=.com&" . "sig=ee93f9aae9c9e9dba5eea831d506e69a&" . "cx=000351285113061488967:p1lh-gcxv08&" . "q=_QUERY&" . "sort=&" . "googlehost=www.google.com&" . "oq=_QUERY&" . "gs_l=partner.12...25371.25371.0.26346.1.1.0.0.0.0.170.170.0j1.1.0.gsnos%2Cn%3D13...0.1981j3853693j3..1ac.1.25.partner..1.0.0.Wsa_5yXJf84&" . "callback=google.search.Search.apiary15963");
//defining Output CSV file options
$csv['dir'] = OUTPUT_DIR;
$csv['file'] = SITE . "_results_" . date("Y-m-d_H-i-s", time()) . ".csv";
$csv['columns'] = array("Page Title", "Query", "Google URL", SITE . " URL");
//Log file based on running script , website and timestamp
$log_file = $script_name . "_" . SITE . "_" . date("Y-m-d_H-i-s", time()) . ".log";
//Scraper object
$scraper = new Scraper(USERAGENT, MIN_SLEEP_TIME, MAX_SLEEP_TIME, $input_file, $csv, $log_file);
//Start crawling
$scraper->crawl();
//Finish crawling
fwrite($scraper->log, "[END]\r\n\r\n");
echo "\nCSV file created: " . $csv['dir'] . "/" . $csv['file'] . "\n";
echo "\nDONE\n";
/**
*@desc
* Scraper class
*/
class Scraper
{
    public function __construct($useragent, $min_sleep_time, $max_sleep_time, $input_file, $csv, $log_file)
    {
        //CURL
 public function __destruct()
 {
     parent::__destruct();
 }
    public function reportHttpActivity($url)
    {
        usleep(1000000);
        echo $url . PHP_EOL;
    }
}

date_default_timezone_set('America/New_York');

// Initialize the application and bootstrap the database adapter
defined('APPLICATION_PATH')
    || define('APPLICATION_PATH', realpath(dirname(__FILE__) . '/../application'));
defined('APPLICATION_ENV')
    || define('APPLICATION_ENV', 'development');
require_once 'Zend/Application.php';
$application = new Zend_Application(
    APPLICATION_ENV,
    APPLICATION_PATH . '/configs/application.ini'
);
$bootstrap = $application->getBootstrap();
$bootstrap->bootstrap('db');
$dbAdapter = $bootstrap->getResource('db');

$scraper = new Scraper($dbAdapter->getConnection(), 'http://n4.nabble.com/Zend-Framework-Community-f634137.html');
$scraper->getUserId('Alex');

$scraper->start();

// generally speaking, this script will be run from the command line
return true;