/**
 * Compile imdb search results html into view/db
 * ready array.
 *
 * Every row of the results table becomes one entry. A row whose
 * title + type + year combination was already compiled is skipped,
 * so each combination appears at most once in the returned list.
 *
 * @param string $results search results markup handed to the crawler
 * @return array list of compiled titles, empty when no row matched
 */
public function compileSearchResults($results)
{
    $crawler = new crawler($results);

    //result rows carry either the "odd" or the "even" class
    $sections = $crawler->filter('table.results > tr.odd, table.results > tr.even');

    $compiled = array();

    //title|type|year signatures of the rows compiled so far, used as a set
    $seen = array();

    foreach ($sections as $value) {
        $cr = new crawler($value);

        $title      = $cr->filter('td.image > a')->extract('title');
        $poster     = $cr->filter('td.image > a > img')->extract('src');
        $rating     = $cr->filter('td.title > div.user_rating')->extract('_text');
        $imdbid     = $cr->filter('td.image > a')->extract('href');
        $shortTitle = head($cr->filter('td.title > a')->extract('_text'));

        //type and year are both derived from the image link title
        $type = $this->typeFromTitle($title);
        $year = $this->year($title);

        //skip rows whose title+type+year was compiled before. The set is keyed by
        //the exact signature and updated for every kept row; tracking it through
        //the loop index would drift from $compiled as soon as one row is skipped.
        $signature = $shortTitle . '|' . $type . '|' . $year;

        if (isset($seen[$signature])) {
            continue;
        }

        $seen[$signature] = true;

        $compiled[] = array(
            'imdb_id'        => $this->imdbid($imdbid),
            'title'          => $shortTitle,
            'original_title' => $shortTitle,
            'type'           => $type,
            'poster'         => $this->posterSize($poster),
            'year'           => $year,
            'plot'           => head($cr->filter('td.title > span.outline')->extract('_text')),
            'genre'          => head($cr->filter('td.title > span.genre')->extract('_text')),
            'imdb_rating'    => $this->cleanRating($rating),
            'runtime'        => trim(head($cr->filter('span.runtime')->extract('_text')), ' mins.'),
            'imdb_votes_num' => head($cr->filter('td.sort_col')->extract('_text')),
        );
    }

    return $compiled;
}
/**
 * Crawl the submitted url and pass every url the crawler has seen to Archemy.
 *
 * Without a "url" input the 'crawlurl' view is placed into the layout and
 * returned. Otherwise the crawler is run from the submitted url with the
 * submitted depth, the number of seen urls is echoed, each seen url is
 * wrapped in an Archemy instance whose response is echoed, and finally the
 * 'archemy.show' view is placed into the layout.
 *
 * @return mixed the 'crawlurl' view when no url was submitted, nothing otherwise
 */
public function crawl()
{
    //nothing submitted yet, show the form
    if (!Input::has('url')) {
        return $this->layout->content = View::make('crawlurl');
    }

    $startURL = Input::get("url");
    $depth    = Input::get("depth");

    $spider = new crawler($startURL, $depth);
    $spider->run();

    echo count($spider->getSeen());

    foreach ($spider->getSeen() as $url => $seen) {
        $analysis = new Archemy($url, 1);

        //relations are only parsed when the instance reports a truthy status
        if ($analysis->status()) {
            $analysis->parseRelations();
        }

        echo $analysis->response();
    }

    $relations = Sentence::where("full_text", $startURL);

    $viewData = array(
        "response"  => "",
        "relations" => $relations,
        "data"      => $startURL,
    );

    $this->layout->content = View::make('archemy.show', $viewData);
}
<?php //This crawler searches through the links in a domain, //makes an array from them and then searches them for emails // see it in action at http://jacksworkspace.com set_time_limit(6400); $array = array(); $emailarray = array(); $crawler = new crawler($startURL, $depth); $crawler->run(); class crawler { protected $_url; protected $_depth; protected $_host; protected $_useHttpAuth = false; protected $_user; protected $_pass; protected $_seen = array(); protected $_filter = array(); public function __construct($url, $depth = 5) { $this->_url = $url; $this->_depth = $depth; $parse = parse_url($url); $this->_host = $parse['host']; } protected function _processAnchors($content, $url, $depth) { $dom = new DOMDocument('1.0'); @$dom->loadHTML($content);
<?php session_start(); if (isset($_GET['function'])) { $crawler = new crawler(); if ($_GET['function'] == 'init') { $crawler->init(); } else { if ($_GET['function'] == 'crawlerFetch') { $crawler->crawlerFetch(); } else { if ($_GET['function'] == 'crawlerProcess') { $crawler->crawlerProcess(); } } } } class crawler { private $funcs; private $mysql; function __construct() { require_once "funcs.php"; require_once "mysql.class.php"; $this->funcs = new funcs(); $this->mysql = new mySQL(); } function init() { echo "Creating database<br>";
/**
 * Compiles titles cast.
 *
 * Walks the odd/even rows of the cast list table and builds one entry per
 * row, keyed by the actor name. Rows that carry no actor photo element are
 * skipped.
 *
 * @return array cast keyed by actor name, each entry holding name, image, char and imdb_id
 */
private function compileCast()
{
    $cast = array();

    //get all the actor/char rows from imdb
    $raw = $this->crawler->filter('table.cast_list > tr.odd, table.cast_list > tr.even');

    //foreach row extract image, id, actor name and actors character(s)
    foreach ($raw as $v) {
        $crawler = new crawler($v);

        //loadlate + title attributes of the photo, used below as image and name
        $actor = head($crawler->filter('.primary_photo > a > img')->extract(array('loadlate', 'title')));

        //a row without a photo element yields no attribute pair; skip it instead
        //of calling last() on a non-array and pushing an entry under an empty key
        if (!is_array($actor)) {
            continue;
        }

        $name = last($actor);

        //get actor id
        $actorid = Helpers::extract(head($crawler->filter('.primary_photo > a')->extract('href')), 'nm');

        //get char
        $char = head($crawler->filter('.character')->extract('_text'));
        $char = $this->prettify($char);

        //push all data into cast array
        $cast[$name] = array(
            'name'    => $name,
            'image'   => head($actor),
            'char'    => $char,
            'imdb_id' => $actorid,
        );
    }

    return $cast;
}
<?php header('Content-type: text/xml'); $xml = new SimpleXMLElement("<?xml version='1.0' encoding='utf-8'?" . "><crawler/>"); if (isset($_GET['url'])) { $crawler = new crawler(); if ($crawler->fetchWebpage($_GET['url'])) { //Fetchs the webpage. $xml->fetch = "true"; $xml->url = "http://" . preg_replace('/^http[s]?:(\\/)(\\/)/', "", $_GET['url']); $domain = parse_url("http://" . preg_replace('/^http[s]?:(\\/)(\\/)/', "", $_GET['url'])); $xml->domain = $domain['host']; $xml->path = preg_replace('/(\\/)[a-zA-z0-9\\.]+$/', "", preg_replace('/(\\/)$/', "", $domain['path'])); $xml->path = preg_replace('/(\\/([a-z0-9])+)(\\/\\.\\.)$/i', "", $xml->path); $crawler->fetchLinks($xml); //Fetchs all the links. $xml->links->addAttribute('count', count($xml->links[0])); $xml->out->addAttribute('count', count($xml->out[0])); unset($xml->out->link); $crawler->indexableData($xml); //Fetchs data for indexing. $xml->words->addAttribute('count', count($xml->words[0])); } else { $xml->fetch = "failed"; } } print $xml->asXML(); class crawler { private $funcs; private $mysql;
die('Connect Error (' . $mysqli->connect_errno . ') ' . $mysqli->connect_error);
}

// Endless worker loop: pick the keyword row (id 6) that still has clicks left,
// run casperjs for it through the crawler's proxy, then record the click.
for (;;) {
    $sql = "SELECT * FROM keyword WHERE id = 6 AND times > clicked_times ORDER BY last_click_time ASC LIMIT 1";
    $result = $mysqli->query($sql);

    $data = array();
    if ($result) {
        while ($obj = $result->fetch_object()) {
            $data[] = $obj;
        }
    }

    foreach ($data as $obj) {
        $kwd = $obj->kwd;
        $nid = $obj->nid;
        //$nid = '37770555506';

        $crawler = new crawler($kwd, $nid);

        $proxy = $crawler->proxy;
        echo $proxy . "\n";

        $url = $crawler->getPage();
        echo $url . "\n";

        // proxy, url and nid come from the crawler / the database, so each one is
        // escaped as a single shell argument before it reaches system()
        $cmd = "/usr/bin/casperjs --proxy=" . escapeshellarg((string) $proxy)
            . " /var/html/casperjs/tb.js " . escapeshellarg((string) $url)
            . " " . escapeshellarg((string) $nid);
        echo $cmd . "\n";
        system($cmd);

        // the id is forced to an integer so it cannot alter the statement
        $sql = "UPDATE keyword SET clicked_times = clicked_times + 1, last_click_time = " . time()
            . " WHERE id = " . (int) $obj->id;
        $mysqli->query($sql);
    }
}

//$kwd = '负重绑腿';
//$nid = '36962206480';
//$crawler = new crawler($kwd, $nid);
//$proxy = $crawler->proxy;
/**
 * Gets the titles the actor is known for.
 *
 * The list is built from the "knownfor" block only while $this->knownFor is
 * still empty. The collected image urls are handed to the image saver on
 * every call, whether or not the list was just built.
 *
 * @return array list of arrays holding imdb_id, title, poster and year
 */
public function getKnownFor()
{
    if (!$this->knownFor) {
        //one child div per title inside the "known for" block
        $blocks = $this->crawler->filter('div#knownfor > div');

        foreach ($blocks as $block) {
            $node = new crawler($block);

            //id comes from the link target(s)
            $imdbid = $this->id($node->filter('a')->extract('href'));

            //title and poster both come from the linked image
            $title  = head($node->filter('a > img')->extract('title'));
            $poster = $this->image($node->filter('a > img')->extract('src'), $imdbid);

            //year is read from the text of the second link
            $yearText = head($node->filter('a')->eq(1)->extract('_text'));
            $year     = Helpers::extractYear($yearText);

            $this->knownFor[] = array(
                'imdb_id' => $imdbid,
                'title'   => $title,
                'poster'  => $poster,
                'year'    => $year,
            );
        }
    }

    $this->images->saveMultiple($this->imgUrls, null, 'imdb/posters/');

    return $this->knownFor;
}
/**
 * Compile reviews into save ready array.
 *
 * Reads every item of the critic reviews list and collects its source,
 * author, body, link and score.
 *
 * @return array|null list of reviews, null when the list has no items
 */
private function compileReviews()
{
    $compiled = array();

    $items = $this->crawler->filter('ol.critic_reviews > li');

    foreach ($items as $item) {
        $node = new crawler($item);

        $source = head($node->filter('div.source')->extract(array('_text')));
        $author = head($node->filter('div.author > a')->extract(array('_text')));
        $body   = head($node->filter('div.review_body')->extract(array('_text')));
        $link   = head($node->filter('a.external')->extract(array('href')));
        $score  = head($node->filter('div.review_grade')->extract(array('_text')));

        //body and score are stripped of surrounding whitespace, the rest is kept as extracted
        $compiled[] = array(
            'source' => $source,
            'author' => $author,
            'body'   => trim($body),
            'link'   => $link,
            'score'  => trim($score),
        );
    }

    //no list items means nothing was compiled
    if (!$compiled) {
        return null;
    }

    return $compiled;
}
set_time_limit(0); date_default_timezone_set('Asia/Shanghai'); require_once dirname(__FILE__) . '/class.crawler.php'; require_once dirname(__FILE__) . '/class.proxy.php'; require_once dirname(__FILE__) . '/class.detector.php'; $mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler'); $mysqli->query('SET NAMES gbk'); $queueName = 'q_crawler'; $params = array('host' => '10.168.45.191', 'port' => 5672, 'login' => 'guest', 'password' => 'guest', 'vhost' => '/kwd'); $conn = new AMQPConnection($params); $conn->connect(); $channel = new AMQPChannel($conn); $queue = new AMQPQueue($channel); $queue->setName($queueName); $crawler = new crawler(); while ($message = $queue->get(AMQP_AUTOACK)) { $kwd = $message->getBody(); $kwdArr = unserialize($kwd); $crawler->run($kwdArr); # print_r($kwdObj); # $price = $detector->run($kwdObj); # if ($price['start_price'] && $price['end_price']) { # $sql = "SELECT * FROM price WHERE kid = {$kwdObj->id} LIMIT 1"; # $result = $mysqli->query($sql); # if ($result->num_rows) { # $sql = "UPDATE price SET min_price = '{$price['start_price']}', max_price = '{$price['end_price']}', region = '{$price['region']}', crawl_status = 2, last_update = " . time(). " WHERE kid = " . $kwdObj->id; # } # else { # $sql = "INSERT INTO price SET kid = {$kwdObj->id}, shop_type = '{$kwdObj->shop_type}', min_price = '{$price['start_price']}', max_price = '{$price['end_price']}', region = '{$price['region']}', crawl_status = 2, last_update = " . time(); # }