<?php
require_once 'crawl.php';

// Sample access-log line used to exercise the crawl helper below.
// (Dead commented-out experiments from earlier debugging have been removed.)
$str = '203.208.60.184 www.tootoo.com - [19/Jul/2010:10:01:02 +8000] "GET /s-ps/calcium-chloride--p-1323192.html HTTP/1.1" 200 3435 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-" "255013"';

$c = new crawl();
$c->log = $str;

// Parse the timestamp out of the log line (result kept on the object).
$c->get_time();

// Report whether the log line's user agent is recognised as a crawler.
if ($c->is_crawler()) {
    echo 'Googlebot';
} else {
    echo 'no';
}

// exists_url() is still invoked for its side effects on $c; the original
// debug echoes in both branches were commented out and have been removed.
if ($c->exists_url('s-')) {
} else {
}
<?php
include_once 'crawl.php';

// Lift the execution time limit so a long-running crawl is never cut short.
set_time_limit(0);

// Kick off the crawl.
$spider = new crawl();
$spider->startCrawl();
<div>
<?php
if (isset($_GET['message'])) {
    // SECURITY FIX: the message comes straight from the query string and was
    // echoed raw — a reflected-XSS hole. Escape it for the HTML context.
    echo htmlspecialchars($_GET['message'], ENT_QUOTES, 'UTF-8');
}
?>
</div>
<!-- NOTE(review): both inputs share id="round" — ids must be unique per
     document. Left unchanged because stylesheets may target #round; should
     be migrated to a class. -->
<form action="handlers/crawl.php" method="post" class="push">
    <input id="round" type="submit" value="All Data" name="alldata" />
    <input id="round" type="submit" value="All Images" name="allimages" />
</form>
<?php
require_once 'init.php';
$crawl = new crawl(new dbc($config), new iniDOM(1, new dbc($config)));
$crawl->data_view();
<?php
// Allow up to one hour for the long-running crawl operations below.
ini_set('max_execution_time', 60 * 60);
require_once '../init.php';

// Only form POSTs are accepted.
if ($_SERVER['REQUEST_METHOD'] != 'POST') {
    header('location:../404.php');
    // BUG FIX: without exit the script kept executing after the redirect.
    exit;
}

// SECURITY FIX: the original used extract($_POST); extract($_GET);, which
// lets a client overwrite ANY local variable (including $config from
// init.php). Read only the fields this handler actually uses, preserving
// the original "unset when absent" isset() semantics.
$alldata   = isset($_POST['alldata'])   ? $_POST['alldata']   : (isset($_GET['alldata'])   ? $_GET['alldata']   : null);
$allimages = isset($_POST['allimages']) ? $_POST['allimages'] : (isset($_GET['allimages']) ? $_GET['allimages'] : null);
$data      = isset($_POST['data'])      ? $_POST['data']      : (isset($_GET['data'])      ? $_GET['data']      : null);
$id        = isset($_POST['id'])        ? $_POST['id']        : (isset($_GET['id'])        ? $_GET['id']        : null);

$crawl = new crawl(new dbc($config), new iniDOM(1, new dbc($config)));

if (isset($alldata)) {
    if ($crawl->get_all_data()) {
        header('location:../crawl.php?message=all data has been inserted');
    } else {
        header('location:../crawl.php?message=failed to get/insert data');
    }
    exit; // BUG FIX: stop after redirect so later branches cannot also run
}

if (isset($allimages)) {
    if ($crawl->get_all_images()) {
        header('location:../crawl.php?message=images has been downloaded');
    } else {
        header('location:../crawl.php?message=failed to download images');
    }
    exit; // BUG FIX: stop after redirect
}

if (isset($data)) {
    if ($crawl->get_data($id)) {
        header('location:../crawl.php?message=data has been inserted');
    } else {
        header('location:../crawl.php?message=failed to insert data');
    }
    exit; // BUG FIX: stop after redirect
}
public function newtable() {
    // Open the database connection here rather than inside the heredoc;
    // building it there produced "undefined database" / syntax errors.
    crawl::$table = new db("localhost", "root", "", "bigdata");

    // Create the per-domain table if it does not exist yet.
    $tableName = crawl::$tabledomain;
    $createSql = <<<EOSQL
CREATE TABLE IF NOT EXISTS {$tableName} ( wen_no BIGINT AUTO_INCREMENT , wen_name TEXT NOT NULL , wen_hash varchar(128) UNIQUE, PRIMARY KEY (wen_no) );
EOSQL;

    crawl::$table->query($createSql);

    // Seed the freshly created table with the current domain.
    $this->insert(crawl::$domain);
}
public function callofperfection($pos) {
    // Hard stop once position 400 has been passed — ends the whole run.
    if ($pos > 400) {
        exit;
    }

    // Advance the shared counter, then crawl the next queued entry.
    $next = $pos + 1;
    crawl::$fordomaincount = $next;
    $entry = crawl::$obj[$next];
    // Constructing crawl($entry) is what performs the next crawl step.
    $entry = new crawl($entry);
}
}

// Indexing (收录) statistics for the current and previous day.
$r = new record();
$curr_record = $r->get_records_by_day($date, $engine);
$last_record = $r->get_records_by_day($prevdate, $engine);

// Indexing summary.
$record_summary = array();
if (isset($curr_record['data']) && count($curr_record['data']) > 0) {
    $curr_rc = current($curr_record['data']);
    // BUG FIX: $prev_rc was undefined whenever the previous day had no
    // data, yet it was still passed to summary_diff(). Default it to an
    // empty array, and guard the 'data' key with isset() as well.
    $prev_rc = array();
    if (isset($last_record['data']) && $last_record['data']) {
        $prev_rc = current($last_record['data']);
    }
    $record_summary = summary_diff($curr_rc, $prev_rc);
}

// Crawl statistics.
$crawl = new crawl();
$curr_crawl = $crawl->get_crawls_by_date($date, $engine);
$last_crawl = $crawl->get_crawls_by_date($prevdate, $engine);

// Crawl summary: day-over-day change of the total crawl count.
$crawl_summary = summary_diff($curr_crawl['all']['count'], $last_crawl['all']['count']);

// Click (hit) statistics.
$hit = new hit();
$curr_hits = $hit->get_hits_by_day($date);
$prev_hits = $hit->get_hits_by_day($prevdate);

// Hand everything to the template.
$tpl->assign('sitename', $site);
$tpl->assign('engine', $engine);
$tpl->assign('today', $date);
$tpl->assign('lastday', $prevdate);
$tpl->assign('calender_str', $calender_str);
$tpl->assign('record_summary', $record_summary);
<?php
include_once 'crawl.php';

// Create the crawler and emit its letter listing.
$printer = new crawl();
$printer->printLetters();
<?php
require_once 'global.php';

// Required query-string parameters (all plain strings).
$engine = isset($_GET['engine']) && !empty($_GET['engine']) ? $_GET['engine'] : '';
$site = isset($_GET['sitename']) && !empty($_GET['sitename']) ? $_GET['sitename'] : '';
$url = isset($_GET['url']) && !empty($_GET['url']) ? $_GET['url'] : '';
$key = isset($_GET['key']) && !empty($_GET['key']) ? $_GET['key'] : '';
$date = isset($_GET['date']) && !empty($_GET['date']) ? $_GET['date'] : '';

if (!$engine || !$site || !$url || !$date) {
    echo 'Error!';
    exit;
}

$timestamp = strtotime($date);
$r = new crawl();

// Current month's data.
$curr_tmp = $r->get_crawl_by_url($url, $key, $date);

// Previous month's data: step back by the number of days in this month.
// NOTE(review): this is an approximation of "one month earlier" — verify
// it behaves as intended across month-length boundaries.
$days = date('t', $timestamp);
$prev_tmp = $r->get_crawl_by_url($url, $key, date('Y-n-d', $timestamp - $days * 24 * 3600));

// Month parameters for the report filters.
$filters = array('yr' => date('Y', $timestamp), 'mo' => date('n', $timestamp));

$curr_data = $prev_data = array();
if ($curr_tmp) {
    // BUG FIX: the loops used $key as the loop variable, clobbering the
    // $key query parameter read above; renamed to $dy to avoid corrupting
    // it for any later use.
    foreach ($curr_tmp as $dy => $val) {
        $curr_data['dy'][$dy] = $val;
    }
}
if ($prev_tmp) {
    foreach ($prev_tmp as $dy => $val) {
        $prev_data['dy'][$dy] = $val;
    }