/**
 * Scrape the given topic url for all the posts in a topic and return them as XML.
 *
 * The page is fetched with Scraper and parsed with GoogleGroupsTopicScraper.
 * Each entry of the parsed topic is read through the keys 'author', 'email',
 * 'date', 'timestamp' and 'body' and emitted as one <post> element.
 *
 * Text fields are escaped for XML so that characters such as "<" or "&" in a
 * scraped value cannot break the document. Entities that are already encoded
 * are left untouched (double_encode = false). The body is wrapped in CDATA; a
 * literal "]]>" inside the body is split across two CDATA sections because it
 * would otherwise terminate the section early.
 *
 * @param string $link URL of the topic page.
 *
 * @return string The concatenated <post> elements, or an empty string when the
 *                topic scraper does not return an array (an error line is
 *                printed in that case).
 */
private function scrape_topic($link)
{
    $scraper = new Scraper($link);
    $scraper->run();

    $topic_scraper = new GoogleGroupsTopicScraper($scraper->html);
    $topic = $topic_scraper->run();

    if (!is_array($topic)) {
        print "ERROR: bad topic (url={$link})\n";
        return '';
    }

    $xml = '';
    $i = 0;
    foreach ($topic as $detail) {
        $author = htmlspecialchars((string) $detail['author'], ENT_QUOTES | ENT_XML1, 'UTF-8', false);
        $email = htmlspecialchars((string) $detail['email'], ENT_QUOTES | ENT_XML1, 'UTF-8', false);
        $date = htmlspecialchars((string) $detail['date'], ENT_QUOTES | ENT_XML1, 'UTF-8', false);
        $timestamp = htmlspecialchars((string) $detail['timestamp'], ENT_QUOTES | ENT_XML1, 'UTF-8', false);

        // "]]>" cannot appear inside CDATA: close the section after "]]" and
        // reopen it before ">".
        $body = str_replace(']]>', ']]]]><![CDATA[>', (string) $detail['body']);

        $xml .= " <post idx=\"{$i}\">\n";
        $xml .= ' <author>' . $author . "</author>\n";
        $xml .= ' <email>' . $email . "</email>\n";
        $xml .= ' <date>' . $date . "</date>\n";
        $xml .= ' <timestamp>' . $timestamp . "</timestamp>\n";
        $xml .= " <body>\n";
        $xml .= "<![CDATA[\n" . $body . "\n]]>\n";
        $xml .= " </body>\n";
        $xml .= " </post>\n";
        $i++;
    }

    return $xml;
}
/**
 * Execute the console command.
 *
 * Reads the "type", "id" and "date" console arguments and hands them, as one
 * associative array, to a freshly created Scraper.
 *
 * @return void
 */
public function fire()
{
    // Build the message from the command arguments.
    $message = array();
    foreach (array('type', 'id', 'date') as $argumentName) {
        $message[$argumentName] = $this->argument($argumentName);
    }

    // Run the scraper with that message.
    $worker = new Scraper();
    $worker->execute($message);
}
/**
 * Scrape the default product listing and write it to the console as JSON.
 *
 * @param InputInterface  $input  Console input; the "prettyprint" option
 *                                selects pretty-printed JSON when truthy.
 * @param OutputInterface $output Console output the JSON is written to.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $default_url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&parent_category_rn=12518&top_category=12518&langId=44&beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&categoryId=185749&listId=&storeId=10151&promotionId=#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";

    $scraper = new Scraper($default_url);
    $scraped_products = $scraper->scrape();

    // The option is only tested for truthiness, so a boolean cast is enough.
    $pretty_print = (bool) $input->getOption('prettyprint');

    // The flag is passed positionally; the former "$pretty = ..." inside the
    // call was a plain assignment, not a named argument.
    $json = $scraper->construct_json($scraped_products, $pretty_print);
    $output->writeln($json);
}
/**
 * Query the Twitter search endpoint and store the reported total.
 *
 * Pages 1 to 9 are requested in turn; the loop stops at the first response
 * whose decoded JSON carries a "total" property, which becomes $this->result.
 *
 * @return mixed The value of $this->result after the loop.
 */
public function run()
{
    $query = array('q' => $this->q, 'page' => 1, 'rpp' => 100);

    if (!empty($this->since)) {
        $query['since'] = date('Y-m-d', strtotime($this->since));
    }
    // Optional filters that are copied through unchanged when set.
    foreach (array('locale', 'geocode') as $field) {
        if (!empty($this->{$field})) {
            $query[$field] = $this->{$field};
        }
    }

    foreach (range(1, 9) as $page) {
        $query['page'] = $page;
        $this->url = 'http://search.twitter.com/search.json?' . http_build_query($query, '', '&');

        $decoded = json_decode(Scraper::scrape($this->url));
        if (!$decoded) {
            continue;
        }
        if (isset($decoded->total)) {
            $this->result = $decoded->total;
            break;
        }
    }

    return $this->result;
}
/**
 * SumFinderClass::findSum() on the scraped "results" must equal the
 * "total" reported in the same scraper output.
 */
public function testFindSum()
{
    $decoded = json_decode(Scraper::start(), true);

    $results = $decoded['results'];
    $expectedTotal = $decoded['total'];

    $finder = new SumFinderClass($results);
    $this->assertEquals($expectedTotal, $finder->findSum());
}
/**
 * Fetch a page through get_html() and extract its content cells.
 *
 * The primary selector is tried first. When it yields nothing, a shorter
 * selector is tried and only the first match of that fallback is returned.
 *
 * @param mixed $service_agenda Passed through to get_html().
 * @param mixed $periodo        Passed through to get_html().
 * @param mixed $curso          Passed through to get_html().
 *
 * @return array The failed get_html() response unchanged, or an array with
 *               "status" => true, "code" => 200 and the extracted "result"
 *               (null when neither selector matched).
 */
function get_scraper($service_agenda, $periodo, $curso)
{
    $scraper = new \Scraper();
    $response = $this->get_html($service_agenda, $periodo, $curso);
    if (!$response['status']) {
        // Fetching the HTML failed: hand the error response back as is.
        return $response;
    }

    $result = $scraper->execute('#contenido table tr td table tr td', $response['content']);
    if ($result != null) {
        // OK
        return array("status" => true, "result" => $result, "code" => 200);
    }

    // Try again with the other page format.
    $result = $scraper->execute('#contenido table tr td', $response['content']);

    // The fallback may come back empty as well; guard the offset access
    // instead of indexing into null.
    $first = isset($result[0]) ? $result[0] : null;

    return array("status" => true, "result" => $first, "code" => 200);
}
/**
 * Load the page and collect its title, RSS link and, optionally, Twitter data.
 *
 * @param mixed $url     Passed to the parent constructor.
 * @param mixed $twitter Optional Twitter account; when truthy, the description
 *                       and image are taken from a twitterAccountScraper.
 */
function __construct($url, $twitter = null)
{
    parent::__construct($url);
    $this->twitter = $twitter;

    $titleNode = $this->crawler->filter('title')->first();
    $this->title = $titleNode->text();

    $feedNode = $this->crawler->filter('link[type="application/rss+xml"]')->first();
    $this->rss = $feedNode->attr('href');

    if (!$twitter) {
        return;
    }

    $account = new twitterAccountScraper($twitter);
    $this->description = $account->description();
    $this->image = $account->profilePic();
}
/**
 * Find the position of the test group's host in Google web search results.
 *
 * Up to 7 result pages of 8 entries each are requested. The first result whose
 * URL matches the host sets $this->result to its 1-based overall position;
 * when nothing matches, $this->result stays false.
 *
 * @return int|false
 */
public function run()
{
    // Google Search API default parameters
    $baseURL = 'http://ajax.googleapis.com/ajax/services/search/web?';
    $query = array('v' => '1.0', 'rsz' => 'large', 'q' => $this->search);

    if (!empty($this->save)) {
        $query['save'] = $this->save;
    }
    if (!empty($this->language)) {
        $query['hl'] = $this->language;
    }
    if (!empty($this->country)) {
        if ($this->country !== true) {
            $query['gl'] = $this->country;
        } elseif (!empty($this->language)) {
            // country === true: reuse the language code, when there is one
            $query['gl'] = $this->language;
        }
    }

    // Walk the result pages looking for the host.
    $perPage = 8;
    $this->result = false;

    for ($page = 0; $page < 7; $page++) {
        $offset = $page * $perPage;

        $requestUrl = $baseURL . http_build_query($query, '', '&') . '&start=' . $offset;
        $raw = Scraper::scrape($requestUrl);
        if (!is_string($raw)) {
            continue;
        }

        $decoded = json_decode($raw);
        if (!is_object($decoded) || !isset($decoded->responseData->results)) {
            continue;
        }

        foreach ($decoded->responseData->results as $position => $hit) {
            if (preg_match('/' . preg_quote($this->TestGroup->host, '/') . '.*/', $hit->url)) {
                $this->result = $offset + $position + 1;
                break 2;
            }
        }
    }

    return $this->result;
}
/**
 * Look up an address with the Google Geocoding API (XML output).
 *
 * See https://developers.google.com/maps/documentation/geocoding/index
 *
 * The address is expected as plain, un-encoded text. "&" is still replaced by
 * "and" as before; the result is then URL-encoded, which turns spaces into
 * "+" and also protects characters such as "#", "?", "=" or "%" that used to
 * corrupt the query string.
 *
 * @param string $address Free-form postal address.
 *
 * @return mixed The 'BODY' entry of the array returned by parent::SingleCurl().
 */
public function Geocode($address)
{
    $address = str_replace("&", "and", $address);

    $url = 'https://maps.googleapis.com/maps/api/geocode/xml?address=' . urlencode($address)
        . '&key=' . urlencode((string) $this->API_key);

    // single curl request
    // NOTE(review): an earlier comment here described EXE/INF keys, but this
    // method reads 'BODY' - confirm against SingleCurl().
    $fetch = parent::SingleCurl($url);

    // wait for 0.2 seconds
    usleep(200000);

    return $fetch['BODY'];
}
/**
 * Fetch $this->url, extract a value with $this->regexp and store it.
 *
 * The named group "match" is preferred, otherwise the first capture group.
 * Dots and commas are removed from the captured text, and a value that then
 * looks like an integer is converted to a float.
 *
 * @return mixed $this->result (left as it was when the pattern does not match).
 */
public function run()
{
    $body = Scraper::scrape($this->url);

    if (!preg_match_all($this->regexp, $body, $found)) {
        return $this->result;
    }

    $value = isset($found['match']) ? $found['match'][0] : $found[1][0];
    $value = preg_replace('@[.,]@', '', $value);

    // type conversion
    if (preg_match('@^-?\\s?\\d+$@', $value)) {
        $value = (float) $value;
    }

    $this->result = $value;

    return $this->result;
}
/**
 * Read site/name pairs from the second table and return them as JSON.
 *
 * Rows are read starting at row 2: the first header cell goes into
 * $this->siti, the second into $this->nomi. Reading stops at the first row
 * whose site cell or name cell is empty.
 *
 * Values are encoded with json_encode() so that quotes, backslashes and
 * control characters in a cell no longer produce invalid JSON. The object
 * layout ({"nome" : ...,"sito" : ...}) is unchanged.
 *
 * @return string JSON array of {"nome", "sito"} objects.
 */
function scrape()
{
    $riga = 2;
    $i = 0;
    do {
        // iterate over the rows
        parent::XPathFilter("//table[2]/tr[{$riga}]/th[1]");
        $tmp = parent::toString('txt');
        if ($tmp == "") {
            // An empty site cell ends the table. (The former "continue" had
            // the same effect, because it jumped to the loop condition.)
            break;
        }
        $this->siti[$i] = $tmp;

        parent::XPathFilter("//table[2]/tr[{$riga}]/th[2]");
        $tmp = parent::toString('txt');
        $this->nomi[$i] = $tmp;

        $riga++;
        $i++;
    } while ($tmp != "");

    // Slashes and non-ASCII text stay unescaped so ordinary values are emitted
    // exactly as before.
    $flags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE;

    $items = array();
    foreach ($this->nomi as $k => $v) {
        $nome = json_encode(trim($v), $flags);
        $sito = json_encode(trim($this->siti[$k]), $flags);
        $items[] = "{\"nome\" : {$nome},\"sito\" : {$sito}}";
    }

    return "[" . implode(",", $items) . "]";
}
<?php

// Directory of this script, exposed as a constant before the model is loaded.
define('SCRAPER_ROOT_PATH', __DIR__);

require_once 'models/Scraper.php';

// Retrieve and save the list of films.
// Source: Filmaffinity TOP FA (films from this year only).
$filmScraper = new Scraper();
$filmScraper->scrapeLastFilms();
$string = preg_replace('/\\s\\s+/', ' ', $string);
return $string;
}

/**
 * Tell whether $word begins with the first word of $name.
 *
 * The general rule is that the first word of the name may start with part of
 * the SKU, so the first word is extracted from the name and looked for at the
 * very beginning of the SKU (case-insensitively). $name should be cleaned
 * before calling this function.
 *
 * @param string $name Cleaned product name.
 * @param string $word The SKU (or other word) to compare against.
 *
 * @return bool True only when the first word of $name is found at offset 0
 *              of $word.
 */
private function startsWithPartofWord($name, $word)
{
    $firstWord = current(explode(" ", $name));

    // An empty first word cannot be "part of" anything.
    if ($firstWord === '') {
        return false;
    }

    // Strict comparison: stripos() returns false when there is no match, and
    // false == 0 is true, which made every non-matching name pass.
    return stripos($word, $firstWord) === 0;
}

/**
 * Drop the first space-separated word of $string.
 *
 * @param string $string Input text.
 *
 * @return string The remainder, trimmed of surrounding whitespace and hyphens.
 */
private function deleteFirstWord($string)
{
    $elements = explode(" ", $string);
    array_shift($elements);
    $shortString = trim(implode(" ", $elements));

    // Also trim off any starting hyphen and spaces.
    $shortString = trim($shortString, "-");
    $shortString = trim($shortString);

    return $shortString;
}
} // closes the enclosing class

// Command-line entry point: three file arguments are required.
if ($argc < 4) {
    die("Usage: php scraper.php inputFile outputFile errorFile");
}

$scraper = new Scraper($argv[1], $argv[2], $argv[3]);
$scraper->scrape();
<?php

require_once 'Scraper.php';

// Run the scraper and dump whatever it returns.
$scraperOutput = Scraper::start();
print_r($scraperOutput);
/**
 * Prepare the data used to answer a request.
 *
 * Delegates to Scraper::scrapData() for $this->url and returns its result
 * unchanged.
 *
 * @return mixed Whatever scrapData() returns.
 */
protected function prepareData()
{
    $scraper = new Scraper();

    return $scraper->scrapData($this->url);
}
<select onchange="setOption(\'max\')" id="max"> <option value="1" ' . ($max === '1' ? 'selected="selected"' : '') . '>1</option> <option value="2" ' . ($max === '2' ? 'selected="selected"' : '') . '>2</option> <option value="3" ' . ($max === '3' ? 'selected="selected"' : '') . '>3</option> <option value="4" ' . ($max === '4' ? 'selected="selected"' : '') . '>4</option> <option value="30" ' . ($max === '30' ? 'selected="selected"' : '') . '>All</option> </select> </div>'; echo '</div>'; echo '<iframe style="display: none;width: 560px;height:315px;" id="frame" allowfullscreen></iframe><div id="frame-hover"></div>'; $db = new Datastore(); $db->tables(); $db->query("SELECT id, name, channel, type FROM channels ORDER BY name"); $channels = $db->fetch(); unset($db); $ws = new Scraper(); foreach ($channels as $channel) { $ws->addChannel($channel['channel'], $sort, $channel['type']); } $ws->fetch(); foreach ($channels as $channel) { $counter = 0; $maxVideos = 1; $videos = $ws->fetchChannel($channel['channel'], !empty($auto)); echo "<div class=\"channel\"><button class=\"channel-button\" onclick=\"hideshow('channel-{$channel['id']}', 200);\">{$channel['name']}</button></div>"; echo "<div id=\"channel-{$channel['id']}\">"; foreach ($videos as $video) { // User already watched video if (!empty($hide) && !empty($watched) && strpos($watched, '|' . $video['id']) !== false) { continue; }
if (empty($_GET['url']) && empty($_GET['xml'])) { $incorrectFlag = true; } if ($incorrectFlag) { $html = <<<EOD <br /> <div id="container"> <div id="response"><pre><p>Please ensure you've supplied both xpath AND a url OR a full XML.</p></pre> </div> </div> EOD; echo $html; exit; } include_once "class.Scraper.php"; $scraper = new Scraper(); if (!empty($_GET['url'])) { $scraper->setUpScraper($_GET['url'], $_GET['xpath']); } if (!empty($_GET['xml'])) { $xmlFromString = $scraper->getXMLFromString($_GET['xml']); $scraper->setPageXML($xmlFromString); $scraper->setXpath($_GET['xpath']); } $queryResponse = $scraper->getResponse(); $output = ""; $output .= <<<EOD <h3>Results of xpath query</h3> <p>Here's the result of the xpath query you ran. And as a bonus, the entire XML tree is <a href="#xml">below</a> too. </p> EOD; $output .= <<<EOD
/**
 * Run the parent constructor, then fix this instance's category id to 36.
 */
public function __construct()
{
    parent::__construct();

    $categoryId = 36;
    $this->setCategoryId($categoryId);
}
<?php

include "library/Scraper.php";

// Scrape the DSE home page and dump the raw result for inspection.
$scraper = new Scraper();
var_dump($scraper->scrapData('http://www.dsebd.org'));
/**
 * Static shortcut: build a Scraper for $url and return what read() yields.
 *
 * @param mixed $url Passed to the Scraper constructor (null by default).
 *
 * @return mixed Result of Scraper::read().
 */
public static function scrape($url = null)
{
    $instance = new Scraper($url);

    return $instance->read();
}
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $userpass); } curl_setopt($ch, CURLOPT_POST, 1); curl_setopt($ch, CURLOPT_POSTFIELDS, $data); $result = curl_exec($ch); if ($result === false) { $result = curl_error($ch); } curl_close($ch); return $result; } ?> <?php if (isset($_POST['btnsubmit'])) { require_once 'scraper.php'; $scraper = new Scraper(); $url = $scraper->getPagePost($_POST['url']); ?> <div style="overflow-x: hidden; overflow-y: scroll; height:300px; width:100%" > <?php echo $url; ?> </div> <?php } ?> </div> </body>
<?php

require_once 'vendor/autoload.php';
require_once './classes/Scraper.class.php';

// Product listing page that is scraped by default.
$default_url = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&parent_category_rn=12518&top_category=12518&langId=44&beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&categoryId=185749&listId=&storeId=10151&promotionId=#langId=44&storeId=10151&catalogId=10137&categoryId=185749&parent_category_rn=12518&top_category=12518&pageSize=20&orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&hideFilters=true";

$scraper = new Scraper($default_url);
$scraped_products = $scraper->scrape();

// Second argument requests pretty-printed JSON. It is passed positionally;
// the former "$pretty = True" was a plain assignment, not a named argument.
print_r($scraper->construct_json($scraped_products, true));
<?php //scritto da Davide Quadrelli header("Content-Type:html;charset=UTF-8"); require_once 'include/Scraper.php'; $toret = array(); if (isset($_GET['url'])) { $scraper = new Scraper(); $scraper->loadPage($_GET["url"], true); if (strpos($_GET['url'], "dlib")) { //pagina di d-lib $scraper->XPathFilter("html/body[1]/form/table[3]/tr[1]/td[1]/table[5]/tr[1]/td[1]/table[1]/tr[1]/td[2]/node()"); $toret[0] = "form1_table3_tr1_td1_table5_tr1_td1_table1_tr1_td2_"; } else { if (strpos($_GET['url'], "unibo")) { //articolo di almajournal $toret[0] = "div1_div3_div2_"; /*$scraper->XPathFilter("//div[@id=\"articleTitle\"] | //div[@id=\"authorString\"] | //div[@id=\"articleAbstract\"] | //div[@id=\"articleSubject\"] | //div[@id=\"articleFullText\"] | //div[@id=\"authorString\"] | //div[@id=\"articleCitations\"] | //a[@id=\"pub-id::doi\"]");*/ $scraper->XPathFilter("//div[@id=\"main\"]/node()"); if ($scraper->toString() == "") { //è un sito unibo ma non un articolo $toret[0] = "body1_"; $scraper->XPathFilter("//body"); } } else { $toret[0] = "body1_"; $scraper->XPathFilter("//body"); } } $toret[1] = $scraper->toString();
// curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
// curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, true);
// Was CURLOPT_CURLOPT_MAXREDIRS, which is not a defined cURL option constant.
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
$this->result = curl_exec($ch);
curl_close($ch);
}

/**
 * Fetch $this->url with file_get_contents() and store the response in
 * $this->result.
 */
private function exec_FGC()
{
    $this->result = file_get_contents($this->url);
}

/**
 * Fetch the URL, using cURL when CURL_ENABLED is truthy and
 * file_get_contents() otherwise. The response ends up in $this->result.
 */
public function run()
{
    if (CURL_ENABLED) {
        $this->exec_CURL();
    } else {
        $this->exec_FGC();
    }
}
} // closes the enclosing class

// NOTE(review): the URL comes straight from the query string and is fetched
// server-side (including through file_get_contents); consider restricting the
// allowed schemes and hosts.
$url = isset($_GET['url']) ? $_GET['url'] : null;

$scraper = new Scraper($url);
$scraper->run();

// The fetched content is untrusted: escape it so it is displayed as text
// inside <pre> rather than interpreted as markup by the browser.
echo '<pre>';
echo htmlspecialchars(print_r($scraper->result, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
echo '</pre>';
/**
 * Download $this->url, clean its content, and load the cleaned copy.
 *
 * The cleaned text is written to a "<timestamp>.tmp" file whose name is kept
 * in $this->to_clean. That file is then loaded over HTTP through
 * parent::loadPage(), using the current server name and the path derived from
 * PHP_SELF by parent::getInitialURL().
 */
function bestemmie_are_coming()
{
    $raw = file_get_contents($this->url);

    // Remove characters that are not valid UTF-8.
    $utf8 = mb_convert_encoding($raw, 'UTF-8', 'UTF-8');

    // Remove non-printable control characters.
    $controlChars = '/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]/u';
    $cleaned = preg_replace($controlChars, '', $utf8);

    // Store the cleaned page in a timestamp-named temporary file.
    $this->to_clean = time() . ".tmp";
    $handle = fopen($this->to_clean, "w");
    fwrite($handle, $cleaned);
    fclose($handle);

    // Build the URL of the temporary file on this server and load it.
    $host = $_SERVER['SERVER_NAME'];
    $basePath = parent::getInitialURL($_SERVER['PHP_SELF']);
    parent::loadPage("http://" . $host . $basePath . $this->to_clean);
}
<?php
/*
    WHM Dynamic DNS Updater v2.1.0
    By Silas Montgomery
    Website: http://reticent.net
    Email: nomsalis@reticent.net
*/

// Classes
require_once "class.ZoneUpdater.php";
require_once "class.Scraper.php";
require_once "class.Logger.php";

// Configuration
// ($websites, $username, $password, $whmUrl and $zones are presumably defined
// in this file - verify.)
require_once "whmDynDns.config.php";

// Logic: look up the current IP and, when one is found, push it to the zones.
try {
    $scraper = new Scraper();
    $scraper->SetUrls($websites);

    $ip = $scraper->GetIp();
    if ($ip) {
        $updater = new ZoneUpdater();
        $updater->SetUsername($username);
        $updater->SetPassword($password);
        $updater->SetUrl($whmUrl);
        $updater->SetIp($ip);
        $updater->SetZones($zones);
        $updater->Update();
    }
} catch (Exception $e) {
    // Any failure is logged.
    Logger::Write($e);
}
<?php if (!empty($_POST)) { session_start(); if (isset($_POST['form']) && (!isset($_SESSION['form']) || $_SESSION['form'] !== $_POST['form'])) { $_SESSION['form'] = $_POST['form']; if (!empty($_POST['name']) && !empty($_POST['id']) && !empty($_POST['type'])) { $name = $_POST['name']; $id = $_POST['id']; $type = $_POST['type'] === 'user' ? 'user' : 'channel'; if (strlen($name) <= 20 && strlen($id) <= 48) { $ws = new Scraper(); if ($ws->addChannel($id, 'new', $type)) { $ws->fetch(); $response = $ws->fetchChannel($id); unset($ws); if (!empty($response)) { $db = new Datastore(); $query = 'INSERT INTO channels VALUES ( NULL, ?, ?, ? )'; $options = array('sss', &$name, &$id, &$type); $db->query($query, $options); unset($db); echo 'Successfully added channel ' . htmlentities($name); } else { echo 'Channel ' . htmlentities($name) . ' does not exist or the page is down';
$script_name = $argv[0]; $input_file = $argv[1]; $website = $argv[2]; // Define it as constant // TODO : Remove this constant define("SITE", $website); define("GOOGLE_BASE_URL", "https://www.google.com/search?&q="); define("GOOGLE_SEARCH_URL", "https://www.googleapis.com/customsearch/v1element?" . "key=AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY&" . "rsz=filtered_cse&" . "num=10&" . "hl=en&" . "prettyPrint=false&" . "source=gcsc&" . "gss=.com&" . "sig=ee93f9aae9c9e9dba5eea831d506e69a&" . "cx=000351285113061488967:p1lh-gcxv08&" . "q=_QUERY&" . "sort=&" . "googlehost=www.google.com&" . "oq=_QUERY&" . "gs_l=partner.12...25371.25371.0.26346.1.1.0.0.0.0.170.170.0j1.1.0.gsnos%2Cn%3D13...0.1981j3853693j3..1ac.1.25.partner..1.0.0.Wsa_5yXJf84&" . "callback=google.search.Search.apiary15963"); //defining Output CSV file options $csv['dir'] = OUTPUT_DIR; $csv['file'] = SITE . "_results_" . date("Y-m-d_H-i-s", time()) . ".csv"; $csv['columns'] = array("Page Title", "Query", "Google URL", SITE . " URL"); //Log file based on running script , website and timestamp $log_file = $script_name . "_" . SITE . "_" . date("Y-m-d_H-i-s", time()) . ".log"; //Scraper object $scraper = new Scraper(USERAGENT, MIN_SLEEP_TIME, MAX_SLEEP_TIME, $input_file, $csv, $log_file); //Start crawling $scraper->crawl(); //Finish crawling fwrite($scraper->log, "[END]\r\n\r\n"); echo "\nCSV file created: " . $csv['dir'] . "/" . $csv['file'] . "\n"; echo "\nDONE\n"; /** *@desc * Scraper class */ class Scraper { public function __construct($useragent, $min_sleep_time, $max_sleep_time, $input_file, $csv, $log_file) { //CURL
/**
 * Destructor: only forwards to the parent's destructor.
 */
public function __destruct()
{
    parent::__destruct();
}
/**
 * Pause for one second, then print the URL followed by a newline.
 *
 * @param string $url URL to print.
 */
public function reportHttpActivity($url)
{
    usleep(1000000);
    echo $url, PHP_EOL;
}
} // closes the enclosing class

date_default_timezone_set('America/New_York');

// Initialize the application and bootstrap the database adapter
if (!defined('APPLICATION_PATH')) {
    define('APPLICATION_PATH', realpath(dirname(__FILE__) . '/../application'));
}
if (!defined('APPLICATION_ENV')) {
    define('APPLICATION_ENV', 'development');
}

require_once 'Zend/Application.php';

$application = new Zend_Application(
    APPLICATION_ENV,
    APPLICATION_PATH . '/configs/application.ini'
);

$bootstrap = $application->getBootstrap();
$bootstrap->bootstrap('db');
$dbAdapter = $bootstrap->getResource('db');

// Start the scraper on the forum index, using the adapter's connection.
$scraper = new Scraper(
    $dbAdapter->getConnection(),
    'http://n4.nabble.com/Zend-Framework-Community-f634137.html'
);
$scraper->getUserId('Alex');
$scraper->start();

// generally speaking, this script will be run from the command line
return true;