/**
 * Crawl from the submitted URL and run Archemy over every URL the crawler saw.
 *
 * When the request carries no `url` input, the 'crawlurl' view is placed in
 * the layout and returned. Otherwise a crawler is started from `url` with the
 * submitted `depth`, each URL it reports via getSeen() is wrapped in an
 * Archemy instance (relations are parsed only when status() is truthy), and
 * the 'archemy.show' view is placed in the layout.
 *
 * Side effects: the number of seen URLs and every Archemy response are
 * echoed directly to the output, ahead of the rendered layout.
 *
 * @return mixed The 'crawlurl' view when no URL was submitted, otherwise null.
 */
public function crawl()
{
    if (!Input::has('url')) {
        return $this->layout->content = View::make('crawlurl');
    }

    $startURL = Input::get("url");
    // NOTE(review): `depth` is passed through unvalidated; it is null when the
    // field is missing, which bypasses any default the crawler constructor has.
    $depth = Input::get("depth");

    // HTTP auth is not configured; a disabled setHttpAuth($username, $password)
    // call used to sit between construction and run().
    $crawler = new crawler($startURL, $depth);
    $crawler->run();

    // Read the seen-URL map once and reuse it for both the count and the loop.
    $seenUrls = $crawler->getSeen();
    echo count($seenUrls);

    // Only the keys (the URLs) are used; the values are ignored.
    foreach ($seenUrls as $url => $seen) {
        $archemy = new Archemy($url, 1);
        if ($archemy->status()) {
            $archemy->parseRelations();
        }
        echo $archemy->response();
    }

    // NOTE(review): where() is passed to the view without a terminating get();
    // confirm the 'archemy.show' view expects a query object and not a result set.
    $relations = Sentence::where("full_text", $startURL);

    $this->layout->content = View::make('archemy.show', array(
        "response" => "",
        "relations" => $relations,
        "data" => $startURL,
    ));
}
<?php //This crawler searches through the links in a domain, //makes an array from them and then searches them for emails // see it in action at http://jacksworkspace.com set_time_limit(6400); $array = array(); $emailarray = array(); $crawler = new crawler($startURL, $depth); $crawler->run(); class crawler { protected $_url; protected $_depth; protected $_host; protected $_useHttpAuth = false; protected $_user; protected $_pass; protected $_seen = array(); protected $_filter = array(); public function __construct($url, $depth = 5) { $this->_url = $url; $this->_depth = $depth; $parse = parse_url($url); $this->_host = $parse['host']; } protected function _processAnchors($content, $url, $depth) { $dom = new DOMDocument('1.0'); @$dom->loadHTML($content);
require_once dirname(__FILE__) . '/class.proxy.php';
require_once dirname(__FILE__) . '/class.detector.php';

// Queue worker: connects to MySQL and to the AMQP broker, then drains the
// `q_crawler` queue, unserializing each message body and handing it to
// crawler::run(). The loop ends as soon as the queue returns no message.

// NOTE(review): database and broker credentials are hard-coded in this file;
// consider loading them from configuration that is kept out of version control.
$mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
if ($mysqli->connect_errno) {
    // Stop before touching the queue: messages are fetched with AMQP_AUTOACK,
    // so anything consumed while the database is unreachable would be gone.
    // (Presumably the crawler needs this connection -- confirm.)
    error_log('crawler worker: MySQL connection failed: ' . $mysqli->connect_error);
    exit(1);
}

// set_charset() instead of a "SET NAMES" query so that the client library is
// also aware of the connection charset (matters for real_escape_string()).
if (!$mysqli->set_charset('gbk')) {
    error_log('crawler worker: could not set MySQL charset to gbk: ' . $mysqli->error);
}

$queueName = 'q_crawler';
$params = array(
    'host' => '10.168.45.191',
    'port' => 5672,
    'login' => 'guest',
    'password' => 'guest',
    'vhost' => '/kwd',
);

$conn = new AMQPConnection($params);
$conn->connect();
$channel = new AMQPChannel($conn);
$queue = new AMQPQueue($channel);
$queue->setName($queueName);

$crawler = new crawler();

// get() is called with AMQP_AUTOACK, so each message is acknowledged on
// retrieval; a falsy return value ends the loop.
while ($message = $queue->get(AMQP_AUTOACK)) {
    $kwd = $message->getBody();

    // NOTE(review): unserialize() on data read from a queue can instantiate
    // arbitrary classes if a producer is untrusted; prefer JSON payloads, or
    // restrict allowed classes once the payload shape is confirmed.
    $kwdArr = unserialize($kwd);
    if ($kwdArr === false && $kwd !== serialize(false)) {
        // Undecodable body: skip it instead of passing `false` to the crawler.
        error_log('crawler worker: skipping message with an undecodable body');
        continue;
    }

    $crawler->run($kwdArr);

    // A disabled price-detection step used to follow here: it ran
    // $detector->run() on the keyword and then inserted or updated the row in
    // the `price` table (min_price, max_price, region, crawl_status,
    // last_update) keyed by `kid`. If it is ever restored, build those
    // statements with prepared queries rather than string interpolation.
}