Example #1
0
<?php

require_once 'crawl.php';
// Sample access-log lines used to exercise the crawl log parser by hand.
// Alternative sample (Googlebot requesting a /buy- URL):
//$str = '203.208.60.184 www.tootoo.com - [19/Jul/2010:10:01:02 +8000] "GET /buy-holz-Q_ISO9000/ HTTP/1.1" 200 3435 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-" "255013"';
$logLine = '203.208.60.184 www.tootoo.com - [19/Jul/2010:10:01:02 +8000] "GET /s-ps/calcium-chloride--p-1323192.html HTTP/1.1" 200 3435 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-" "255013"';

// Hand the raw log line to a fresh parser instance.
$parser = new crawl();
$parser->log = $logLine;

// Called for its effect on the parser object; the return value is not used.
$parser->get_time();

// Report whether the parser classifies this request as coming from a crawler.
echo $parser->is_crawler() ? 'Googlebot' : 'no';

// Look for the 's-' URL prefix. The outcome is deliberately ignored: this
// script only performs the call and prints nothing for it.
$parser->exists_url('s-');
Example #2
0
<?php

include_once 'crawl.php';
// A crawl can take arbitrarily long, so lift PHP's execution time limit first.
set_time_limit(0);

// Build the crawler and kick off the crawl.
$spider = new crawl();
$spider->startCrawl();
Example #3
0
<div>
<?php
// Status message passed back by handlers/crawl.php via the query string.
// It is attacker-controllable, so it must be HTML-escaped before output;
// non-string values (e.g. ?message[]=x) are ignored.
if (isset($_GET['message']) && is_string($_GET['message'])) {
    echo htmlspecialchars($_GET['message'], ENT_QUOTES, 'UTF-8');
}
?>
</div>
	<form action="handlers/crawl.php" method="post" class="push">
		<input id="round" type="submit" value="All Data" name="alldata" />
		<input id="round" type="submit" value="All Images" name="allimages" />
	</form>
<?php
// NOTE(review): both submit buttons above share id="round", which is invalid
// HTML; left unchanged because a stylesheet may target #round - confirm.
require_once 'init.php';

// Render the crawl data view below the form.
$crawl = new crawl(new dbc($config), new iniDOM(1, new dbc($config)));
$crawl->data_view();
Example #4
0
<?php

// Crawling can run for a long time; allow up to one hour.
ini_set('max_execution_time', 60 * 60);
require_once '../init.php';

// This handler only accepts POST. Stop immediately after redirecting,
// otherwise the rest of the script would still run for other methods.
if ($_SERVER['REQUEST_METHOD'] != 'POST') {
    header('location:../404.php');
    exit;
}

// Read request parameters explicitly instead of extract()-ing them into the
// current scope: extract() would let a request overwrite $config (and any
// other variable defined by init.php). Query-string values take precedence
// over POST values, matching the previous extract($_POST); extract($_GET) order.
$request = $_GET + $_POST;

$crawl = new crawl(new dbc($config), new iniDOM(1, new dbc($config)));

// Outcome message for the redirect; when several actions are requested at
// once, every action runs and the last one's message is reported.
$message = null;

if (isset($request['alldata'])) {
    $message = $crawl->get_all_data()
        ? 'all data has been inserted'
        : 'failed to get/insert data';
}

if (isset($request['allimages'])) {
    $message = $crawl->get_all_images()
        ? 'images has been downloaded'
        : 'failed to download images';
}

if (isset($request['data'])) {
    // "id" selects the record to fetch; null is passed when it is missing.
    $id = isset($request['id']) ? $request['id'] : null;
    $message = $crawl->get_data($id)
        ? 'data has been inserted'
        : 'failed to insert data';
}

if ($message !== null) {
    // Encode the message so the Location URL contains no raw spaces or slashes.
    header('location:../crawl.php?message=' . rawurlencode($message));
}
Example #5
0
    /**
     * Connects to the database, creates the per-domain table if it does not
     * exist yet, and then inserts the current domain via insert().
     */
    public function newtable()
    {
        // The connection is stored on the static crawl::$table property.
        crawl::$table = new db("localhost", "root", "", "bigdata");

        // A static property cannot be interpolated with {$...} inside a
        // heredoc, so the table name is copied into a plain local first.
        $tableName = crawl::$tabledomain;
        $createSql = <<<EOSQL
                CREATE TABLE IF NOT EXISTS {$tableName} (
                  wen_no BIGINT AUTO_INCREMENT ,
                  wen_name TEXT NOT NULL ,
                  wen_hash varchar(128) UNIQUE,
                  PRIMARY KEY (wen_no)
                  );

EOSQL;

        crawl::$table->query($createSql);
        $this->insert(crawl::$domain);
    }
Example #6
0
 /**
  * Advances to the entry after $pos in crawl::$obj and constructs a new
  * crawl for it; terminates the script once $pos exceeds 400.
  *
  * @param int $pos Index of the entry that was just handled.
  */
 public function callofperfection($pos)
 {
     // Hard stop: end the whole script once the position passes 400.
     if ($pos > 400) {
         exit;
     }

     $nextIndex = $pos + 1;
     crawl::$fordomaincount = $nextIndex;

     // Constructing the object is the point; the instance itself is not used further.
     $nextEntry = crawl::$obj[$nextIndex];
     $spawned = new crawl($nextEntry);
 }
Example #7
0
}
// Indexing (inclusion) statistics
$r = new record();
$curr_record = $r->get_records_by_day($date, $engine);
$last_record = $r->get_records_by_day($prevdate, $engine);
// Indexing summary
$record_summary = array();
if (isset($curr_record['data']) && count($curr_record['data']) > 0) {
    $curr_rc = current($curr_record['data']);
    // Default to null so summary_diff() is never handed an undefined variable
    // when the previous day has no records.
    $prev_rc = null;
    if (!empty($last_record['data'])) {
        $prev_rc = current($last_record['data']);
    }
    $record_summary = summary_diff($curr_rc, $prev_rc);
}
// Crawl statistics
$crawl = new crawl();
$curr_crawl = $crawl->get_crawls_by_date($date, $engine);
$last_crawl = $crawl->get_crawls_by_date($prevdate, $engine);
// Crawl summary
$crawl_summary = summary_diff($curr_crawl['all']['count'], $last_crawl['all']['count']);
// Hit (click-through) statistics
$hit = new hit();
$curr_hits = $hit->get_hits_by_day($date);
$prev_hits = $hit->get_hits_by_day($prevdate);
// Hand the collected values to the template.
$tpl->assign('sitename', $site);
$tpl->assign('engine', $engine);
$tpl->assign('today', $date);
$tpl->assign('lastday', $prevdate);
$tpl->assign('calender_str', $calender_str);
$tpl->assign('record_summary', $record_summary);
Example #8
0
<?php

include_once 'crawl.php';
// Build a crawler and run its printLetters() routine.
$letterCrawler = new crawl();
$letterCrawler->printLetters();
Example #9
0
<?php

require_once 'global.php';
// Request parameters; a missing or empty parameter becomes ''.
$engine = !empty($_GET['engine']) ? $_GET['engine'] : '';
$site = !empty($_GET['sitename']) ? $_GET['sitename'] : '';
$url = !empty($_GET['url']) ? $_GET['url'] : '';
$key = !empty($_GET['key']) ? $_GET['key'] : '';
$date = !empty($_GET['date']) ? $_GET['date'] : '';
// engine, sitename, url and date are mandatory; key is optional.
if (!$engine || !$site || !$url || !$date) {
    echo 'Error!';
    exit;
}
$timestamp = strtotime($date);
// Reject dates strtotime() cannot parse instead of silently working with an
// invalid timestamp.
if ($timestamp === false) {
    echo 'Error!';
    exit;
}
$r = new crawl();
// Current month's data
$curr_tmp = $r->get_crawl_by_url($url, $key, $date);
// Previous month's data.
// Subtracting the current month's length in seconds can skip a month
// (2010-03-01 minus 31 days is 2010-01-29), so step back exactly one calendar
// month and clamp the day to that month's length. mktime() treats month 0 as
// December of the previous year.
$days = date('t', $timestamp);
$prev_month_start = mktime(0, 0, 0, (int) date('n', $timestamp) - 1, 1, (int) date('Y', $timestamp));
$prev_month_day = min((int) date('j', $timestamp), (int) date('t', $prev_month_start));
$prev_month_date = date('Y-n-', $prev_month_start) . sprintf('%02d', $prev_month_day);
$prev_tmp = $r->get_crawl_by_url($url, $key, $prev_month_date);
// Month parameters
$filters = array('yr' => date('Y', $timestamp), 'mo' => date('n', $timestamp));
$curr_data = $prev_data = array();
if ($curr_tmp) {
    foreach ($curr_tmp as $key => $val) {
        $curr_data['dy'][$key] = $val;
    }
}
if ($prev_tmp) {
    foreach ($prev_tmp as $key => $val) {
        $prev_data['dy'][$key] = $val;
    }