function crawlAll($tname, $styear) {
    for ($year = $styear; $year <= 2015; $year++) {
        for ($period = 1; $period <= 4; $period++) {
            crawl($tname, $year, $period, null);
        }
    }
}
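// Example invocation sketch (the table name and start year are hypothetical,
// shown only to illustrate the expected arguments):
// crawlAll('grades', 2010); // crawls periods 1-4 of every year 2010..2015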
// AMQP consumer callback: decode the JSON message body, crawl the contained
// id, then acknowledge the delivery so the broker can discard it.
function process_message($msg) {
    global $debug;
    global $useragent;
    $json = $msg->body;
    $ob = json_decode($json);
    $id = $ob->id;
    print "ID: {$id}\n";
    crawl($id);
    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
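// A minimal consumer-loop sketch for the callback above. The
// delivery_info/basic_ack style matches php-amqplib, so that library is
// assumed here; the queue name 'crawl_queue' and the connection credentials
// are placeholders, not taken from the original code.
use PhpAmqpLib\Connection\AMQPStreamConnection;

$connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
$channel = $connection->channel();
$channel->queue_declare('crawl_queue', false, true, false, false);
// no_ack = false, so process_message() must basic_ack() each delivery, as it does above.
$channel->basic_consume('crawl_queue', '', false, false, false, false, 'process_message');
while (count($channel->callbacks)) {
    $channel->wait();
}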
function crawl($dir) {
    $dir_handle = @opendir($dir) or die("Unable to open directory");
    echo "Directory Listing: {$dir}<br/>";
    while (($file = readdir($dir_handle)) !== false) { // compare against false explicitly; readdir() can return "0"
        //$file = rawurlencode($file);
        //$url = str_replace('+', '%20', $file);
        if ($file != ".." && $file != ".") {
            echo "{$file}<br>";
            if (substr($file, -1) === "~") { // the old strpos() == strlen()-1 check was buggy: strpos() returns false when "~" is absent
                echo "<b>{$file}</b><br>";
                unlink($dir . "/" . $file); // delete editor backup files ending in "~"
            }
            if (is_dir($dir . "/" . $file)) {
                crawl($dir . "/" . $file); // recurse into subdirectories
            }
        }
    }
    closedir($dir_handle);
}
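// Example usage sketch: recursively list a tree and purge "~" backup files.
// The directory path is hypothetical.
// crawl('/var/www/uploads');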
// AMQP consumer callback: fetch a SoundCloud profile, archive it, ship the
// archive to the storage host, then acknowledge the message.
function process_message($msg) {
    global $debug;
    global $useragent;
    $json = $msg->body;
    $ob = json_decode($json);
    $id = $ob->id;
    $url = $ob->url;
    $aurl = $ob->avatar_url;
    print "*** FETCHING id: {$id}, {$url}\n";
    mkdir("data/{$id}");
    mkdir("data/{$id}/uploads");
    file_put_contents("data/{$id}/profile.json", "{$json}\n");
    get_meta($id, $aurl);
    crawl($id, $url);
    // Pack the crawl results, copy them off-box, then clean up locally.
    system("nice tar -cjvf data/{$id}.tar.bz2 data/{$id}");
    system("rm -rf data/{$id}");
    system("scp data/{$id}.tar.bz2 steve@10.0.0.77:/temp/soundcloud");
    system("rm -rf data/{$id}.tar.bz2");
    print "*** DONE id: {$id}, {$url}\n";
    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
function crawl($sUrl, $sCategory) {
    global $oDb;
    static $aSeen = array();
    if (isset($aSeen[$sUrl])) {
        return; // if we crawled this page in the past we ignore it
    }
    $aMasterParts = parse_url($sUrl); // keep the seed's parts so we can stay on the same host
    $aSeen[$sUrl] = true;
    trace(" >> CRAWLING: {$sUrl}");
    // Skip binary/document downloads we can't parse as HTML.
    if (substr($sUrl, -4) == ".pdf" || substr($sUrl, -4) == ".rar" || substr($sUrl, -4) == ".zip" || substr($sUrl, -5) == ".docx" || substr($sUrl, -4) == ".doc") {
        trace("*** IGNORING: " . substr($sUrl, -4));
        return;
    }
    $sRawContent = @file_get_contents($sUrl);
    //$sRawContent = replaceSpecialChars($sRawContent); //BUG: utf-8 encoding is still wrong
    $oDom = new DOMDocument('1.0');
    if (!@$oDom->loadHTML($sRawContent)) { // crawl the given url
        //BUG: we can't store any cookies, so sites relying on SESSION_ID will fail hard
        trace("* INVALID RESPONSE");
        return;
    }
    $sRawContent = ""; // free the memory
    // Hash the URL and the normalized content so unchanged pages can be detected.
    $sUrlHash = md5($sUrl);
    $sContent = $oDom->saveHTML();
    $sContentHash = md5($sContent);
    trace(" -> Url Hash: {$sUrlHash}");
    trace("    Content length: " . strlen($sContent) . " bytes");
    trace("    ContentHash: " . $sContentHash);
    trace();
    $oCursor = $oDb->pages->findOne(['urlHash' => $sUrlHash, 'contentHash' => $sContentHash]);
    if ($oCursor) {
        // Content is found, unchanged, and already indexed; ignore for now.
        trace("****CONTENT IS NOT CHANGED!");
    } else {
        // Content is new or changed: record the hashes and parse the DOM.
        $oDb->pages->insert(['urlHash' => $sUrlHash, 'contentHash' => $sContentHash]);
        parse($oDom, $sCategory, $sUrl);
    }
    // Get all the links from the content and recursively crawl each valid link.
    $oAnchors = $oDom->getElementsByTagName('a');
    foreach ($oAnchors as $oElement) {
        $sHref = $oElement->getAttribute('href');
        $aParts = parse_url($sHref);
        if (isset($aParts['scheme']) && ($aParts['scheme'] == "javascript" || $aParts['scheme'] == "ftp")) {
            // Ignore ftp:// and javascript: links for now.
            trace("** INVALID SCHEME: {$sHref}");
            continue;
        }
        if (strpos($sHref, 'http') !== 0) {
            // Relative link: rebuild an absolute URL from the current page's parts.
            $sPath = '/' . ltrim($sHref, '/');
            $aParts = parse_url($sUrl);
            $sHref = $aParts['scheme'] . '://'; // http or https
            if (isset($aParts['user']) && isset($aParts['pass'])) {
                $sHref .= $aParts['user'] . ':' . $aParts['pass'] . '@'; // keep user:pass@ authentication
            }
            $sHref .= $aParts['host'];
            if (isset($aParts['port'])) {
                $sHref .= ':' . $aParts['port']; // keep a non-default port
            }
            $sHref .= $sPath;
        }
        $aParts = parse_url($sHref);
        // Double check that we're still on http(s) and that the target host stays in scope.
        if (isset($aParts['scheme'], $aParts['host'])
                && ($aParts['scheme'] == "http" || $aParts['scheme'] == "https") // was $aParts['https'], which is never set by parse_url()
                && $aParts['host'] == $aMasterParts['host']) {
            crawl($sHref, $sCategory);
        } else {
            trace("*** IGNORING: {$sHref} (not in our scope)");
        }
    }
}
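// Example seed sketch (the URL and category label are hypothetical): the
// crawler only follows links on the seed's host, so one call walks one site.
// crawl('http://www.example.com/', 'news');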
var_dump($result);
exit;
}

// Fetch the most recent files from a Shingetsu node and mirror up to five of
// them into the local data/ directory, preserving their timestamps.
function crawl(Shingetsu_Client $s) {
    $files = $s->recent();
    rsort($files); // newest first
    $break_limit = 5;
    foreach ($files as $file) {
        echo date('Y-m-d H:i:s', $file['timestamp']) . $file['filename'] . PHP_EOL;
        if ($break_limit < 1) {
            return;
        }
        if ($s->have($file['filename'])) {
            $data = $s->get($file['filename'], '0-');
            file_put_contents("data/{$file['filename']}", $data);
            chmod("data/{$file['filename']}", 0666);
            touch("data/{$file['filename']}", $file['timestamp']);
            sleep(1);
        }
        $break_limit -= 1;
    }
}

$s = new Shingetsu_Client(SERVER_ADDRESS);
//$node = $s->node(); var_dump($node); exit;
crawl(new Shingetsu_Client($s->node()));
//$my_node = str_replace('/', '+', ':80/server.php'); $result = $s->join($my_node); var_dump($result); exit;
//$result = $s->have('thread_6F70657261'); var_dump($result); exit;
//$result = $s->have('thread_E69CAC'); var_dump($result); exit;
//get_thread($s, 'thread_503250');
// Fetch feed URLs in batches of 16 and re-crawl any feed whose XML changed
// since the cached copy in the cast table.
function crawl_urls($urls) {
    include 'cc-settings.php';
    include_once 'util.php';
    $sth = $dbh->query("SELECT * FROM {$db_prefix}cast");
    if ($sth) {
        $db = $sth->fetchAll();
        $i = 0;
        $sth = $dbh->prepare("UPDATE {$db_prefix}cast SET xml=? WHERE url=?");
        for ($j = 0; $j < count($urls); $j += 16) {
            $feeds = multiHTTP(array_slice($urls, $j, 16));
            foreach ($feeds as $feed) {
                // Strip the HTTP headers; the body starts after the blank line.
                $data = substr($feed, strpos($feed, "\r\n\r\n") + 4);
                echo $urls[$i] . "\n";
                $entry = where($db, "URL", $urls[$i]);
                if ($entry != null) {
                    if (strcmp($entry["XML"], $data) != 0) {
                        // Cached copy is stale: update it and re-crawl.
                        $sth->execute(array($data, $urls[$i]));
                        crawl($urls[$i], $data);
                    }
                } else {
                    // New feed: crawl it, then cache the XML.
                    crawl($urls[$i], $data);
                    $sth->execute(array($data, $urls[$i]));
                }
                $i++;
            }
        }
    }
}
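// The where() helper comes from util.php and is not shown above; this is a
// sketch of what the call sites imply it does (the name where_sketch and the
// exact matching rule are assumptions, not the original implementation):
function where_sketch($rows, $column, $value) {
    foreach ($rows as $row) {
        if (isset($row[$column]) && $row[$column] === $value) {
            return $row; // first row whose column matches
        }
    }
    return null; // mirrors the $entry != null check above
}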
$d4ds = $nmDataService->getForumHubDataService($forumid);
$threads = $d4ds->getNewThreads($forumid, $lastCrawled);
$createdat = 0;
foreach ($threads as $t) {
    $url = $t['url'];
    $exist = $ds->getThreadByUrl($url);
    if (!$exist) {
        // Track the newest creation time so the forum's lastCrawled marker can be advanced.
        $current_createdat = $t['createdat'];
        if ($current_createdat > $createdat) {
            $createdat = $current_createdat;
        }
        libxml_use_internal_errors(true);
        $start = time();
        $logService->log('D4CRAWLER', 'd4crawler new thread forumid=' . $forumid, var_log($url, '$url'), 'd4crawler');
        try {
            @crawl($url, '', '', '', '', '', false);
        } catch (Exception $x) {
            echo $x;
            $logService->log('ERROR', 'd4 crawler exception', var_log($x, '$x'), 'd4crawler'); // was log_var(), which doesn't match the helper used everywhere else
        }
    } else {
        //$logService->log('D4CRAWLER','Thread exists already ',var_log($url,'$url'),'d4crawler');
    }
}
// NOTE: $url here is the last thread URL from the loop (undefined when $threads is empty).
$logService->log('D4CRAWLER', 'Forum finished forumid=' . $forumid, var_log($url, '$url'), 'd4crawler');
if ($createdat) {
    $ds->updateForumLastCrawled($forumid, $createdat);
}
}

function crawl($url, $default_date, $default_description, $default_language, $default_title, $default_site_name, $nocrawl) {
function subscribe_to($feedurl, $name = null, $label = null, $crawl = false) {
    $userid = $this->app->userid;
    $castinfo = null; // was previously left undefined when $name was given, breaking the return value
    if ($crawl) {
        $castid = crawl($feedurl);
    } else {
        // Use a prepared statement; interpolating $feedurl directly was an SQL injection hole.
        $sth = $this->dbh->prepare("SELECT CastID FROM {$this->db_prefix}cast WHERE url = ?");
        $sth->execute(array($feedurl));
        $castid = $sth->fetch(PDO::FETCH_ASSOC)['CastID'];
    }
    if ($label == null) {
        $label = "root";
    }
    if ($name == null) {
        $castinfo = $this->get_cast($castid);
        if (array_key_exists("title", $castinfo)) {
            $name = $castinfo["title"];
        } else {
            $name = $feedurl;
        }
    }
    $sth = $this->dbh->query("SELECT * FROM {$this->db_prefix}subscription WHERE castid={$castid} AND userid={$userid}");
    if ($sth && $sth->rowCount() < 1) {
        $sth = $this->dbh->prepare("INSERT INTO {$this->db_prefix}subscription (castid, name, userid)
            VALUES({$castid}, :name, {$userid})");
        $sth->bindParam(":name", $name);
        $sth->execute();
        $this->add_to_label("cast/" . $castid, $label);
    }
    return array("id" => $castid, "name" => $name, "url" => $feedurl, "feed" => $castinfo);
}
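// Example call sketch (the feed URL and label are hypothetical). With
// $crawl = true the feed is fetched and indexed first; otherwise the CastID
// is looked up from the existing cast table:
// $sub = $this->subscribe_to('http://example.com/feed.xml', null, 'podcasts', true);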
public function build_files() {
    $this->files = array("driver" => array(), "ext" => array(), "util" => array(), "shared" => array(), "include" => array());
    // Collect the source file lists for each build target.
    crawl($this->files["driver"], "framework/driver", array("!\.c$!", "!\.h$!", "!\.inc$!"), false);
    crawl($this->files["driver"], "framework/driver/x86", array("!\.c$!", "!\.S$!", "!\.h$!", "!\.inc$!"), false);
    crawl($this->files["ext"], "app/extensions", array("!\.c$!", "!\.S$!", "!\.inc$!", "!\.h$!"), true);
    crawl($this->files["include"], "app/include", array("!\.h$!"), false);
    crawl($this->files["include"], "framework/include", array("!\.h$!"), false);
    crawl($this->files["shared"], "framework", array("!main_shared\.c$!"), false);
    crawl($this->files["util"], "framework", array("!main_util\.c$!", "!fuzz\.c$!", "!bench\.c$!"), true);
    $this->projects["lib"]["files"] = array("driver", "ext", "include");
    $this->projects["dll"]["files"] = array("driver", "ext", "include", "shared");
    $this->projects["util"]["files"] = array("driver", "ext", "include", "util");
}
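// The crawl() helper these calls rely on is not shown; this sketch is inferred
// purely from the call sites above (the name crawl_sketch is a placeholder):
// append to $list every file under $dir whose name matches one of the preg
// $patterns, recursing into subdirectories when $recurse is true.
function crawl_sketch(array &$list, $dir, array $patterns, $recurse) {
    foreach (scandir($dir) as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dir . '/' . $entry;
        if (is_dir($path)) {
            if ($recurse) {
                crawl_sketch($list, $path, $patterns, $recurse);
            }
            continue;
        }
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $entry)) {
                $list[] = $path;
                break; // one matching pattern is enough
            }
        }
    }
}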
}

// Mark a URL as crawled so it is not picked up again.
function crawled($id) {
    $sql = 'UPDATE `all_url` SET done=1 WHERE id=' . $id;
    mysql_query($sql);
}

// Create the per-language sentence and word tables if they don't exist yet.
function add_language_if_not($i) {
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_sentence` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `sen` varchar(1000) NOT NULL,
        `done` int(11) NOT NULL DEFAULT '0',
        `length` int(11) NOT NULL,
        `words` int(11) NOT NULL,
        PRIMARY KEY (`id`)
    )");
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_words` (
        `word` varchar(50) NOT NULL,
        `download` int(10) unsigned NOT NULL DEFAULT '0',
        `occur` int(10) unsigned NOT NULL DEFAULT '0',
        PRIMARY KEY (`word`)
    )");
}

$query = "SELECT * FROM `all_url` WHERE done=0 AND re>-1 ORDER BY `urgent` DESC,`id` ASC LIMIT 0,5";
$result = mysql_query($query);
while ($row = mysql_fetch_array($result)) {
    /* Mark the row as crawled up front, before the crawl actually runs, because:
       1. We can't afford one bad URL breaking the whole script.
       2. This ensures even buggy URLs get passed over next time.
       3. The rest is secret. */
    crawled($row['id']);
    // Check that the tables exist, else create them quickly.
    add_language_if_not($row['language']);
    // Everything is set up, so crawl the URL.
    crawl($row);
    // Politeness delay so we don't hammer any webserver by crawling again and again.
    // sleep() only takes whole seconds, so the old sleep(0.3) slept for 0 seconds.
    usleep(300000);
}
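// mysql_query() and friends were removed in PHP 7; a minimal mysqli port of
// the loop above, assuming hypothetical connection credentials (the helper
// functions would need the same porting):
$mysqli = new mysqli('localhost', 'user', 'pass', 'crawler');
$result = $mysqli->query("SELECT * FROM `all_url` WHERE done=0 AND re>-1 ORDER BY `urgent` DESC, `id` ASC LIMIT 0,5");
while ($row = $result->fetch_assoc()) {
    crawled($row['id']);
    add_language_if_not($row['language']);
    crawl($row);
    usleep(300000); // 0.3 s politeness delay
}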
<?php
session_start();
$_SESSION['key'] = isset($_GET['q']) ? $_GET['q'] : ''; // guard against a missing query parameter
// Crawl the matching items on Taobao and Amazon.
require "search.php";
crawl($_SESSION['key']);
?>
<!DOCTYPE html>
<html class="dk_fouc has-js" lang="en">
<head>
    <title>2tao</title>
    <meta content="width=device-width, initial-scale=1.0" name="viewport">
    <link rel="stylesheet" href="css/bootstrap.css">
    <link rel="stylesheet" href="css/flat-ui.css">
    <link href="images/megaphone.ico" rel="shortcut icon">
</head>
<body>
    <div class="container">
        <div class="span12">
            <dl class="palette palette-info">
                <form class="row" action="item.php" method="get">
                    <div class="span2"><h3><a href="index.php">2tao</a></h3><dd>@ZHY</dd></div>
                    <div class="span6">
                        <input class="span6" type="text" placeholder="" value="" name="q"/>
                    </div>
                    <div class="span2">
                        <input class="btn btn-primary btn-large btn-block" type="submit" value="Search" />
                    </div>
                </form>
{
    $C = new WSCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#"); // only receive HTML pages
    $C->addURLFilterRule("#(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i"); // skip assets and documents
    if (!isset($GLOBALS['bgFull'])) {
        $C->setTrafficLimit(2000 * 1024); // cap traffic unless a full background crawl was requested
    }
    $C->obeyRobotsTxt(true);
    $C->obeyNoFollowTags(true);
    $C->setUserAgentString("DingoBot (http://search.subinsb.com/about/bot.php)");
    $C->setFollowMode(0);
    $C->go();
}

if (!isset($url4Array)) {
    // No URL list given: re-crawl a random already-indexed URL, or fall back
    // to the default seed if the index is still empty.
    $last = $dbh->query("SELECT `url` FROM search");
    $count = $last->rowCount();
    if ($count < 1) {
        crawl("http://subinsb.com"); // the default URL #1
    } else {
        $urls = $last->fetchAll();
        $index = rand(0, $count - 1);
        crawl($urls[$index]['url']);
    }
} elseif (is_array($url4Array)) {
    foreach ($url4Array as $url) {
        crawl($url);
    }
}
foreach ((array) $results as $key => $values) {
    echo $key . ' ' . count($values) . " found\n";
}
return;
}
echo count($results) . " found\n";
}

list($function, $params) = parseParams($argv);
if ($function === 'crawl.php') {
    if (empty($params[0]) && empty($params['uri'])) {
        return crawlHelp();
    }
    $processor = $processorFile = null;
    $uri = $params[0];
    if (!strpos($uri, '://')) {
        $uri = 'http://' . $uri; // default to http when no scheme was given
    }
    extract($params);
    if (!$processor) {
        // Derive the processor name from the host, e.g. www.example.com -> example.
        $parts = parse_url($uri);
        $processor = strtolower(str_replace(array('www.', '.com'), '', $parts['host']));
        $processorFile = $processor . '_processor.php';
        if (!file_exists(dirname(__FILE__) . '/' . $processor . '_processor.php')) {
            // No site-specific processor available: fall back to the generic one.
            $processor = 'Generic';
            $processorFile = 'generic_processor.php';
        }
    } elseif (!$processorFile) {
        $processorFile = $processor . '_processor.php';
    }
    crawl($uri, $params, $processor, $processorFile);
}
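// Example CLI invocation sketch (the host is hypothetical):
//   php crawl.php www.example.com
// This normalizes the URI to http://www.example.com and dispatches to
// example_processor.php if it exists, else to generic_processor.php.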
$sql = 'UPDATE `all_url` SET done=1 WHERE id=' . $id;
mysql_query($sql);
}

// Create the per-language sentence and word tables if they don't exist yet.
function add_language_if_not($i) {
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_sentence` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `sen` varchar(1000) NOT NULL,
        `done` int(11) NOT NULL DEFAULT '0',
        `length` int(11) NOT NULL,
        `words` int(11) NOT NULL,
        PRIMARY KEY (`id`)
    )");
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_words` (
        `word` varchar(50) NOT NULL,
        `download` int(10) unsigned NOT NULL DEFAULT '0',
        `occur` int(10) unsigned NOT NULL DEFAULT '0',
        PRIMARY KEY (`word`)
    )");
}

$query = "SELECT * FROM `all_url` WHERE done=0 ORDER BY `urgent` DESC,`id` ASC LIMIT 0,1";
$result = mysql_query($query);
while ($row = mysql_fetch_array($result)) {
    // Debug variant of the crawl loop: the done-flagging and table setup are
    // disabled, and the URL is taken from the request instead of the row.
    // crawled($row['id']);
    // add_language_if_not($row['language']);
    //var_dump($row);
    $testtt = array();
    $testtt["url"] = $_GET['URL'];
    crawl($testtt);
    // Politeness delay so we don't hammer any webserver by crawling again and again.
    // sleep() only takes whole seconds, so the old sleep(0.3) slept for 0 seconds.
    usleep(300000);
}
    # Renew the access token before it expires.
    renewAccessToken();
}
$out[$currentPost['id']]['id'] = $currentPost['id'];
$out[$currentPost['id']]['status'] = "done";
$out[$currentPost['id']]['type'] = $currentPost['type'];
$data = array();
$start_time = microtime(true);
try {
    if ($currentPost['type'] == "page") {
        if (isset($currentPost['data'])) {
            $data = $currentPost['data'];
        }
        $data = fb_page_extract($currentPost['id'], $facebook, $data);
    } else {
        $data = crawl($currentPost['id'], $facebook);
    }
} catch (Exception $e) {
    print "-- Interrupted @ " . get_execution_time(true) . "<br/>\n";
    flush();
    ob_flush();
    error_log(microtime(true) . ";" . $e->getCode() . ";[" . get_class($e) . "]" . $e->getMessage() . ";" . $currentPost['id'] . "\n", 3, dirname($_SERVER['SCRIPT_FILENAME']) . "/log/error.log");
    $out[$currentPost['id']]['status'] = "error";
    $out[$currentPost['id']]['error_msg'] = $e->getMessage();
}
$out[$currentPost['id']]['exec_time'] = microtime(true) - $start_time;
$out[$currentPost['id']]['data'] = $data;
//file_put_contents('outputs/'.$currentPost['id'], json_encode($out[$currentPost['id']]));
}

// Push changes
for ($i = 0; $i < 10; $i++) {
<script src="js/bootstrap.min.js"></script> <!-- Script to Activate the Carousel --> <script> $('.carousel').carousel({ interval: 5000 //changes the speed }) // function log(txt) { // $("#log").html("location : <b>" + txt + "</b> px") // } // $(function() { // var eTop = $('.navbar').offset().top; //get the offset top of the element // log(eTop - $(window).scrollTop()); //position of the ele w.r.t window // $(window).scroll(function() { //when window is scrolled // log(eTop - $(window).scrollTop()); // }); // }); </script> <script src="js/backpacker.js"></script> </body> <?php include 'crawler-test.php'; crawl(); ?> </html>
    flush();
}
}
}

/**
 * crawl method
 * Create the crawler object and set the options for crawling
 * @param type $u URL
 */
function crawl($u) {
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#"); /* Only receive HTML pages */
    $C->addURLFilterRule("#(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i"); /* Don't crawl non-HTML resources */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true); /* Respect robots.txt */
    $C->go();
}

// Get the URLs from the url table and crawl each one.
$links = $capsule->table('url')->get();
foreach ($links as $link) {
    $crawlURL = $link->url;
    if (isset($crawlURL)) {
        crawl($crawlURL);
    }
}
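// MyCrawler itself is not shown; the setURL()/addContentTypeReceiveRule()/go()
// calls match the PHPCrawl library, so this is a minimal sketch of the subclass
// it presumably extends (the echo body is a placeholder for real indexing logic):
class MyCrawlerSketch extends PHPCrawler {
    function handleDocumentInfo($DocInfo) {
        // PHPCrawl calls this once per received document.
        echo $DocInfo->url . " (" . $DocInfo->http_status_code . ")\n";
        flush();
    }
}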