Example #1
function crawlAll($tname, $styear)
{
    for ($year = $styear; $year <= 2015; $year++) {
        for ($period = 1; $period <= 4; $period++) {
            crawl($tname, $year, $period, null);
        }
    }
}
function process_message($msg)
{
    global $debug;
    global $useragent;
    $json = $msg->body;
    $ob = json_decode($json);
    $id = $ob->id;
    print "ID: {$id}\n";
    crawl($id);
    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
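For context, a callback with this shape is what php-amqplib expects from basic_consume(); a minimal sketch of wiring process_message() into a consumer loop follows. The queue name crawl_queue, host, and credentials are assumptions for illustration, not part of the original.

// Sketch only: registering process_message() as a php-amqplib consumer.
// Queue name, host, and credentials below are assumptions.
require_once __DIR__ . '/vendor/autoload.php';

use PhpAmqpLib\Connection\AMQPStreamConnection;

$connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
$channel = $connection->channel();
$channel->queue_declare('crawl_queue', false, true, false, false);

// no_ack = false, so process_message() must basic_ack() each message itself
$channel->basic_consume('crawl_queue', '', false, false, false, false, 'process_message');

while (count($channel->callbacks)) {
    $channel->wait();
}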
Example #3
function crawl($dir)
{
    $dir_handle = @opendir($dir) or die("Unable to open directory");
    echo "Directory Listing: {$dir}<br/>";
    while (($file = readdir($dir_handle)) !== false) { // strict check: a file named "0" would otherwise end the loop
        //$file = rawurlencode($file);
        //$url = str_replace('+' , '%20' , $file);
        if ($file != ".." && $file != ".") {
            echo "{$file}<br>";
            if (strpos($file, "~") == strlen($file) - 1) {
                echo "<b>{$file}</b><br>";
                unlink($dir . "/" . $file);
            }
            if (is_dir($dir . "/" . $file)) {
                crawl($dir . "/" . $file);
            }
        }
    }
    closedir($dir_handle);
}
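As a side note, the same walk-and-delete pass can be written with SPL's recursive iterators, which avoids the manual opendir()/readdir() bookkeeping; a rough equivalent of the function above:

// Sketch: same cleanup as crawl() above, using SPL iterators.
// Deletes every file whose name ends in "~" under $dir, recursively.
function crawl_spl($dir)
{
    $it = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS)
    );
    foreach ($it as $file) {
        if ($file->isFile() && substr($file->getFilename(), -1) === '~') {
            echo "<b>{$file->getFilename()}</b><br>";
            unlink($file->getPathname());
        }
    }
}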
function process_message($msg)
{
    global $debug;
    global $useragent;
    $json = $msg->body;
    $ob = json_decode($json);
    $id = $ob->id;
    $url = $ob->url;
    $aurl = $ob->avatar_url;
    print "*** FETCHING id: {$id}, {$url}\n";
    mkdir("data/{$id}");
    mkdir("data/{$id}/uploads");
    file_put_contents("data/{$id}/profile.json", "{$json}\n");
    get_meta($id, $aurl);
    crawl($id, $url);
    system("nice tar -cjvf data/{$id}.tar.bz2 data/{$id}");
    system("rm -rf data/{$id}");
    system("scp data/{$id}.tar.bz2 steve@10.0.0.77:/temp/soundcloud");
    system("rm -rf data/{$id}.tar.bz2");
    print "*** DONE id: {$id}, {$url}\n";
    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
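Because $id comes straight out of a queue message, the shell commands above are worth hardening; here is a sketch of the same archive/ship/cleanup steps with escaped arguments (same host and paths assumed as in the original):

// Sketch: same steps as above with shell arguments escaped, since $id
// arrives from an external message. Host and paths are unchanged.
$safeDir = escapeshellarg("data/{$id}");
$safeTar = escapeshellarg("data/{$id}.tar.bz2");
system("nice tar -cjvf {$safeTar} {$safeDir}");
system("rm -rf {$safeDir}");
system("scp {$safeTar} steve@10.0.0.77:/temp/soundcloud");
unlink("data/{$id}.tar.bz2");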
Example #5
function crawl($sUrl, $sCategory)
{
    global $oDb;
    static $aSeen = array();
    if (isset($aSeen[$sUrl])) {
        return;
    }
    $aMasterParts = parse_url($sUrl);
    //get the parts of the url so we can later check if we're only crawling the same domain
    $aSeen[$sUrl] = true;
    //if we crawled this page in the past we ignore it
    trace(" >> CRAWLING: {$sUrl}");
    if (substr($sUrl, -4) == ".pdf" || substr($sUrl, -4) == ".rar" || substr($sUrl, -4) == ".zip" || substr($sUrl, -5) == ".docx" || substr($sUrl, -4) == ".doc") {
        trace("*** IGNORING: " . substr($sUrl, -4));
        return;
    }
    $sRawContent = @file_get_contents($sUrl);
    //$sRawContent = replaceSpecialChars($sRawContent);
    //BUG: utf-8 encoding is still wrong
    $oDom = new DOMDocument('1.0');
    if (!@$oDom->loadHTML($sRawContent)) {
        //crawl the given url
        //BUG: we can't store any cookies, so sites relying on SESSION_ID will fail hard
        trace("*   INVALID RESPONSE");
        return;
    }
    $sRawContent = "";
    //free the memory
    $sUrlHash = md5($sUrl);
    //create the url and content hash
    $sContent = $oDom->saveHTML();
    $sContentHash = md5($sContent);
    trace("     -> Url Hash: {$sUrlHash}");
    trace("        Content length: " . strlen($sContent) . " bytes");
    trace("        ContentHash: " . $sContentHash);
    trace();
    $oCursor = $oDb->pages->findOne(['urlHash' => $sUrlHash, 'contentHash' => $sContentHash]);
    if ($oCursor) {
        //content is found, not changed and already indexed, ignoring for now
        trace("****CONTENT IS NOT CHANGED!");
    } else {
        //content is changed so add the urlHash and contentHash to the collection
        $oDb->pages->insert(['urlHash' => $sUrlHash, 'contentHash' => $sContentHash]);
        //parse the dom to get the new content
        parse($oDom, $sCategory, $sUrl);
    }
    //get all the links from the content and recursively crawl each valid link
    $oAnchors = $oDom->getElementsByTagName('a');
    foreach ($oAnchors as $oElement) {
        $sHref = $oElement->getAttribute('href');
        $aParts = parse_url($sHref);
        //make sure we're only getting the correct scheme: http or https
        if ($aParts['scheme'] == "javascript" || $aParts['scheme'] == "ftp") {
            //ignore ftp:// and javascript:// for now
            trace("**  INVALID SCHEME: {$sHref}");
            continue;
        }
        if (strpos($sHref, 'http') !== 0) {
            //if the path is relative we'll rebuild it
            $sPath = '/' . ltrim($sHref, '/');
            $aParts = parse_url($sUrl);
            $sHref = $aParts['scheme'] . '://';
            //http or https
            if (isset($aParts['user']) && isset($aParts['pass'])) {
                //if we have some sort of authentication, add it here user:pass@url
                $sHref .= $aParts['user'] . ':' . $aParts['pass'] . '@';
            }
            $sHref .= $aParts['host'];
            if (isset($aParts['port'])) {
                $sHref .= ':' . $aParts['port'];
                //if we have the port other than 80, specified add it here
            }
            $sHref .= $sPath;
        }
        $aParts = parse_url($sHref);
        //double check to see if we're still on the correct scheme and if the target host is in the same scope as the current iteration host
        if (($aParts['scheme'] == "http" || $aParts['https']) && $aParts['host'] == $aMasterParts['host']) {
            crawl($sHref, $sCategory);
        } else {
            trace("*** IGNORING: {$sHref} (not in our scope)");
        }
    }
}
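For orientation, this crawler is started with a seed URL and a category tag. A usage sketch under its stated assumptions: the legacy mongo extension (whose collections expose insert(), matching $oDb->pages->insert() above) plus trace() and parse() defined elsewhere in the project. The seed URL and category are placeholders.

// Usage sketch for the crawler above; MongoClient is the legacy mongo
// extension, and the seed URL and category are placeholders.
$m = new MongoClient('mongodb://localhost:27017');
$oDb = $m->selectDB('crawler');
crawl('http://example.com/', 'news');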
Example #6
    var_dump($result);
    exit;
}
function crawl(Shingetsu_Client $s)
{
    $files = $s->recent();
    rsort($files);
    $break_limit = 5;
    foreach ($files as $file) {
        echo date('Y-m-d H:i:s', $file['timestamp']) . ' ' . $file['filename'] . PHP_EOL;
        if ($break_limit < 1) {
            return;
        }
        if ($s->have($file['filename'])) {
            $data = $s->get($file['filename'], '0-');
            file_put_contents("data/{$file['filename']}", $data);
            chmod("data/{$file['filename']}", 0666);
            touch("data/{$file['filename']}", $file['timestamp']);
            sleep(1);
        }
        $break_limit -= 1;
    }
}
$s = new Shingetsu_Client(SERVER_ADDRESS);
//$node = $s->node(); var_dump($node); exit;
crawl(new Shingetsu_Client($s->node()));
//$my_node = str_replace('/', '+', ':80/server.php');
//$result = $s->join($my_node); var_dump($result); exit;
//$result = $s->have('thread_6F70657261'); var_dump($result); exit;
//$result = $s->have('thread_E69CAC'); var_dump($result); exit;
//get_thread($s, 'thread_503250');
function crawl_urls($urls)
{
    include 'cc-settings.php';
    include_once 'util.php';
    $sth = $dbh->query("SELECT * FROM {$db_prefix}cast");
    if ($sth) {
        $db = $sth->fetchAll();
        $i = 0;
        $sth = $dbh->prepare("UPDATE {$db_prefix}cast SET xml=? WHERE url=?");
        for ($j = 0; $j < sizeof($urls); $j += 16) {
            $feeds = multiHTTP(array_slice($urls, $j, 16));
            foreach ($feeds as $feed) {
                $data = substr($feed, strpos($feed, "\r\n\r\n") + 4);
                echo $urls[$i] . "\n";
                $entry = where($db, "URL", $urls[$i]);
                if ($entry != null) {
                    if (strcmp($entry["XML"], $data) != 0) {
                        $sth->execute(array($data, $urls[$i]));
                        crawl($urls[$i], $data);
                    }
                } else {
                    crawl($urls[$i], $data);
                    $sth->execute(array($data, $urls[$i]));
                }
                $i++;
            }
        }
    }
}
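The where() helper above comes from util.php and is not shown; a hypothetical implementation consistent with how it is called here (return the first row whose column matches a value, or null) might be:

// Hypothetical sketch of the where() helper from util.php (not shown in
// this listing): return the first row whose $column equals $value.
function where(array $rows, $column, $value)
{
    foreach ($rows as $row) {
        if (isset($row[$column]) && $row[$column] === $value) {
            return $row;
        }
    }
    return null;
}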
    $d4ds = $nmDataService->getForumHubDataService($forumid);
    $threads = $d4ds->getNewThreads($forumid, $lastCrawled);
    $createdat = 0;
    foreach ($threads as $t) {
        $url = $t['url'];
        $exist = $ds->getThreadByUrl($url);
        if (!$exist) {
            $current_createdat = $t['createdat'];
            if ($current_createdat > $createdat) {
                $createdat = $current_createdat;
            }
            libxml_use_internal_errors(true);
            $start = time();
            $logService->log('D4CRAWLER', 'd4crawler new thread forumid=' . $forumid, var_log($url, '$url'), 'd4crawler');
            try {
                @crawl($url, '', '', '', '', '', false);
            } catch (Exception $x) {
                echo $x;
                $logService->log('ERROR', 'd4 crawler exception', var_log($x, '$x'), 'd4crawler');
            }
        } else {
            //$logService->log('D4CRAWLER','Thread exists already ',var_log($url,'$url'),'d4crawler');
        }
    }
    $logService->log('D4CRAWLER', 'Forum finished forumid=' . $forumid, var_log($url, '$url'), 'd4crawler');
    if ($createdat) {
        $ds->updateForumLastCrawled($forumid, $createdat);
    }
}
function crawl($url, $default_date, $default_description, $default_language, $default_title, $default_site_name, $nocrawl)
{
    // ... (function body not included in this listing)
}
 function subscribe_to($feedurl, $name = null, $label = null, $crawl = false)
 {
     $userid = $this->app->userid;
     if ($crawl) {
         $castid = crawl($feedurl);
     } else {
         $sth = $this->dbh->query("SELECT CastID FROM {$this->db_prefix}cast WHERE url='{$feedurl}'");
         $castid = $sth->fetch(PDO::FETCH_ASSOC)['CastID'];
     }
     if ($label == null) {
         $label = "root";
     }
     if ($name == null) {
         $castinfo = $this->get_cast($castid);
         if (array_key_exists("title", $castinfo)) {
             $name = $castinfo["title"];
         } else {
             $name = $feedurl;
         }
     }
     $sth = $this->dbh->query("SELECT * FROM {$this->db_prefix}subscription WHERE castid={$castid} AND userid={$userid}");
     if ($sth && $sth->rowCount() < 1) {
         $sth = $this->dbh->prepare("INSERT INTO {$this->db_prefix}subscription (castid, name, userid) \n\t\t\tVALUES({$castid}, :name, {$userid})");
         $sth->bindParam(":name", $name);
         $sth->execute();
         $this->add_to_label("cast/" . $castid, $label);
     }
     return array("id" => $castid, "name" => $name, "url" => $feedurl, "feed" => $castinfo);
 }
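Note that the lookup and insert above interpolate $castid and $userid straight into the SQL; a sketch of the same two statements for the body of subscribe_to(), with every value bound through PDO placeholders instead:

// Sketch: the same subscription check/insert with all values bound as
// parameters rather than interpolated. Only the table prefix stays inline.
$sth = $this->dbh->prepare("SELECT * FROM {$this->db_prefix}subscription WHERE castid = ? AND userid = ?");
$sth->execute(array($castid, $userid));
if ($sth->rowCount() < 1) {
    $ins = $this->dbh->prepare("INSERT INTO {$this->db_prefix}subscription (castid, name, userid) VALUES (?, ?, ?)");
    $ins->execute(array($castid, $name, $userid));
    $this->add_to_label("cast/" . $castid, $label);
}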
Example #10
	public function build_files() {
		$this->files = array("driver"=>array(), "ext"=>array(), "util"=>array(), "shared"=>array(), "include"=>array());
		crawl($this->files["driver"], "framework/driver", array("!\.c$!", "!\.h$!", "!\.inc$!"), false);
		crawl($this->files["driver"], "framework/driver/x86", array("!\.c$!", "!\.S$!", "!\.h$!", "!\.inc$!"), false);
		crawl($this->files["ext"], "app/extensions", array("!\.c$!", "!\.S$!", "!\.inc$!", "!\.h$!"), true);
		crawl($this->files["include"], "app/include", array("!\.h$!"), false);
		crawl($this->files["include"], "framework/include", array("!\.h$!"), false);
		crawl($this->files["shared"], "framework", array("!main_shared\.c$!"), false);
		crawl($this->files["util"], "framework", array("!main_util\.c$!", "!fuzz\.c$!", "!bench\.c$!"), true);

		$this->projects["lib"]["files"] = array("driver", "ext", "include");
		$this->projects["dll"]["files"] = array("driver", "ext", "include", "shared");
		$this->projects["util"]["files"] = array("driver", "ext", "include", "util");
	}
}
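The crawl() that build_files() relies on is not shown in this listing; a hypothetical implementation consistent with these calls (append every file under $dir matching one of the preg patterns to $list, descending into subdirectories when $recurse is true) could be:

// Hypothetical sketch of the crawl() used by build_files() above.
function crawl(array &$list, $dir, array $patterns, $recurse)
{
    foreach (scandir($dir) as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dir . '/' . $entry;
        if (is_dir($path)) {
            if ($recurse) {
                crawl($list, $path, $patterns, $recurse);
            }
            continue;
        }
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $entry)) {
                $list[] = $path;
                break; // one match is enough; move to the next entry
            }
        }
    }
}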
function crawled($id)
{
    $sql = 'UPDATE `all_url` SET done=1 WHERE id=' . (int) $id; // cast to int: $id goes straight into the SQL
    mysql_query($sql);
}
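Since the mysql_* functions used here were removed in PHP 7, an equivalent of crawled() on mysqli is sketched below, under the assumption of a $link connection handle created elsewhere:

// Sketch: crawled() on mysqli instead of the removed mysql_* API.
// $link is an assumed mysqli connection.
function crawled_mysqli(mysqli $link, $id)
{
    $stmt = $link->prepare('UPDATE `all_url` SET done=1 WHERE id = ?');
    $stmt->bind_param('i', $id);
    $stmt->execute();
    $stmt->close();
}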
function add_language_if_not($i)
{
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_sentence` (\r\n\t  `id` int(11) NOT NULL AUTO_INCREMENT,\r\n\t  `sen` varchar(1000) NOT NULL,\r\n\t  `done` int(11) NOT NULL DEFAULT '0',\r\n\t  `length` int(11) NOT NULL,\r\n\t  `words` int(11) NOT NULL,\r\n\t  PRIMARY KEY (`id`)\r\n\t)");
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_words` (\r\n\t  `word` varchar(50) NOT NULL,\r\n\t  `download` int(10) unsigned NOT NULL DEFAULT '0',\r\n\t  `occur` int(10) unsigned NOT NULL DEFAULT '0',\r\n\t  PRIMARY KEY (`word`)\r\n\t)");
}
$query = "SELECT * FROM `all_url` WHERE done=0 AND re>-1 ORDER BY `urgent` DESC,`id` ASC LIMIT 0,5";
$result = mysql_query($query);
while ($row = mysql_fetch_array($result)) {
    /*
    Yes, this URL is marked as crawled now.
    It is done at the top, i.e. even before crawling actually happens, because:
    1. We can't afford to let one particular URL break the script.
    2. This ensures even buggy URLs get passed over.
    3. The rest is secret.
    */
    crawled($row['id']);
    //Check if tables exist, else make them quickly..!
    add_language_if_not($row['language']);
    //let's crawl now as everything is set.. :D
    crawl($row);
    //Crawl and do anything you wish with this URL.. :P
    //let's go to sleep, enough work for now.
    // Reason: we can't afford to bug any web server by crawling again and again.
    usleep(300000); // sleep() only takes whole seconds; usleep() gives the intended 0.3s pause
}
Example #12
<?php

session_start();
$_SESSION['key'] = $_GET["q"];
//crawl the data in Taobao and Amazon
require "search.php";
crawl($_SESSION['key']);
?>

<!DOCTYPE html>
<html class="dk_fouc has-js" lang="en">
  <head>
    <title>2tao</title>
    <meta content="width=device-width, initial-scale=1.0" name="viewport">
    <link rel="stylesheet" href="css/bootstrap.css">
    <link rel="stylesheet" href="css/flat-ui.css">
    <link href="images/megaphone.ico" rel="shortcut icon">
  </head>
  <body>
    <div class="container">
	  <div class="span12">
    <dl class="palette palette-info">
      <form class="row" action="item.php" method="get">
			<div class="span2"><h3><a href="index.php">2tao</a></h3><dd>@ZHY</dd></div>
			<div class="span6">
				<input class="span6" type="text" placeholder="" value="" name="q"/>
			</div>
			<div class="span2">
        <input class="btn btn-primary btn-large btn-block" type="submit" value="Search" />
			</div>
		  </form>
Example #13
function crawl($u)
{
    $C = new WSCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    $C->addURLFilterRule("#(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");
    if (!isset($GLOBALS['bgFull'])) {
        $C->setTrafficLimit(2000 * 1024);
    }
    $C->obeyRobotsTxt(true);
    $C->obeyNoFollowTags(true);
    $C->setUserAgentString("DingoBot (http://search.subinsb.com/about/bot.php)");
    $C->setFollowMode(0);
    $C->go();
}
if (!isset($url4Array)) {
    // Get the last indexed URLs (if there are none, use the default URL) & start crawling
    $last = $dbh->query("SELECT `url` FROM search");
    $count = $last->rowCount();
    if ($count < 1) {
        crawl("http://subinsb.com");
        // The Default URL #1
    } else {
        $urls = $last->fetchAll();
        $index = rand(0, $count - 1);
        crawl($urls[$index]['url']);
    }
} elseif (is_array($url4Array)) {
    foreach ($url4Array as $url) {
        crawl($url);
    }
}
Example #14
        foreach ((array) $results as $key => $values) {
            echo $key . ' ' . count($values) . " found\n";
        }
        return;
    }
    echo count($results) . " found\n";
}
list($function, $params) = parseParams($argv);
if ($function === 'crawl.php') {
    if (empty($params[0]) && empty($params['uri'])) {
        return crawlHelp();
    }
    $processor = $processorFile = null;
    $uri = $params[0];
    if (!strpos($uri, '://')) {
        $uri = 'http://' . $uri;
    }
    extract($params);
    if (!$processor) {
        $parts = parse_url($uri);
        $processor = strtolower(str_replace(array('www.', '.com'), '', $parts['host']));
        $processorFile = $processor . '_processor.php';
        if (!file_exists(dirname(__FILE__) . '/' . $processor . '_processor.php')) {
            $processor = 'Generic';
            $processorFile = 'generic_processor.php';
        }
    } elseif (!$processorFile) {
        $processorFile = $processor . '_processor.php';
    }
    crawl($uri, $params, $processor, $processorFile);
}
function crawled($id)
{
    $sql = 'UPDATE `all_url` SET done=1 WHERE id=' . (int) $id; // cast to int: $id goes straight into the SQL
    mysql_query($sql);
}
function add_language_if_not($i)
{
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_sentence` (\r\n\t  `id` int(11) NOT NULL AUTO_INCREMENT,\r\n\t  `sen` varchar(1000) NOT NULL,\r\n\t  `done` int(11) NOT NULL DEFAULT '0',\r\n\t  `length` int(11) NOT NULL,\r\n\t  `words` int(11) NOT NULL,\r\n\t  PRIMARY KEY (`id`)\r\n\t)");
    mysql_query("CREATE TABLE IF NOT EXISTS `" . $i . "_words` (\r\n\t  `word` varchar(50) NOT NULL,\r\n\t  `download` int(10) unsigned NOT NULL DEFAULT '0',\r\n\t  `occur` int(10) unsigned NOT NULL DEFAULT '0',\r\n\t  PRIMARY KEY (`word`)\r\n\t)");
}
$query = "SELECT * FROM `all_url` WHERE done=0 ORDER BY `urgent` DESC,`id` ASC LIMIT 0,1";
$result = mysql_query($query);
while ($row = mysql_fetch_array($result)) {
    /*
    Yes, this URL is marked as crawled now.
    It is done at the top, i.e. even before crawling actually happens, because:
    1. We can't afford to let one particular URL break the script.
    2. This ensures even buggy URLs get passed over.
    3. The rest is secret.
    */
    //	crawled($row['id']);
    //Check if tables exist, else make them quickly..!
    //	add_language_if_not($row['language']);
    //let's crawl now as everything is set.. :D
    //var_dump($row);
    $testtt = array();
    $testtt["url"] = $_GET['URL'];
    crawl($testtt);
    //Crawl and do anything you wish with this URL.. :P
    //let's go to sleep, enough work for now.
    // Reason: we can't afford to bug any web server by crawling again and again.
    usleep(300000); // sleep() only takes whole seconds; usleep() gives the intended 0.3s pause
}
Example #16
         #Renew accessToken
         renewAccessToken();
     }
     $out[$currentPost['id']]['id'] = $currentPost['id'];
     $out[$currentPost['id']]['status'] = "done";
     $out[$currentPost['id']]['type'] = $currentPost['type'];
     $data = array();
     $start_time = microtime(true);
     try {
         if ($currentPost['type'] == "page") {
             if (isset($currentPost['data'])) {
                 $data = $currentPost['data'];
             }
             $data = fb_page_extract($currentPost['id'], $facebook, $data);
         } else {
             $data = crawl($currentPost['id'], $facebook);
         }
     } catch (Exception $e) {
         print "-- Interrupted @ " . get_execution_time(true) . "<br/>\n";
         flush();
         ob_flush();
         error_log(microtime(1) . ";" . $e->getCode() . ";[" . get_class($e) . "]" . $e->getMessage() . ";" . $currentPost['id'] . "\n", 3, dirname($_SERVER['SCRIPT_FILENAME']) . "/log/error.log");
         $out[$currentPost['id']]['status'] = "error";
         $out[$currentPost['id']]['error_msg'] = $e->getMessage();
     }
     $out[$currentPost['id']]['exec_time'] = microtime(true) - $start_time;
     $out[$currentPost['id']]['data'] = $data;
     //file_put_contents('outputs/'.$currentPost['id'], json_encode($out[$currentPost['id']]));
 }
 //Push changes
 for ($i = 0; $i < 10; $i++) {
Example #17
    <script src="js/bootstrap.min.js"></script>

    <!-- Script to Activate the Carousel -->
    <script>
    $('.carousel').carousel({
        interval: 5000 // cycle interval in milliseconds
    })

// function log(txt) {
//   $("#log").html("location : <b>" + txt + "</b> px")
// }

// $(function() {
//   var eTop = $('.navbar').offset().top; //get the offset top of the element
//   log(eTop - $(window).scrollTop()); //position of the ele w.r.t window

//   $(window).scroll(function() { //when window is scrolled
//     log(eTop - $(window).scrollTop());
//   });
// });

    </script>

    <script src="js/backpacker.js"></script>

</body>
	<?php 
include 'crawler-test.php';
crawl();
?>
</html>
Example #18
            flush();
        }
    }
}
/**
 * crawl method
 * Creates the crawler object and sets the options for crawling.
 * @param string $u URL to crawl
 */
function crawl($u)
{
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    /* Only receive HTML pages */
    $C->addURLFilterRule("#(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");
    /* We don't want to crawl non-HTML pages */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true);
    /* Whether we should obey robots.txt */
    $C->go();
}
//get URL from url table
$links = $capsule->table('url')->get();
foreach ($links as $link) {
    $crawlURL = $link->url;
    if (isset($crawlURL)) {
        //Start crawling
        crawl($crawlURL);
    }
}
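MyCrawler above is assumed to be a subclass of PHPCrawl's PHPCrawler class; PHPCrawl hands each fetched page to an overridden handleDocumentInfo() callback. A minimal sketch of such a subclass, with the indexing step left as a placeholder:

// Sketch: a minimal MyCrawler, assuming it extends PHPCrawl's PHPCrawler.
// handleDocumentInfo() is PHPCrawl's per-document callback; what happens
// with each page (indexing, storage) is up to the application.
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
    {
        if ($DocInfo->received) {
            // e.g. hand the page source off to an indexer here
            echo $DocInfo->url . " (HTTP " . $DocInfo->http_status_code . ")\n";
            flush();
        }
    }
}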