Example #1
0
 /**
  * Compile imdb search results html into view/db
  * ready array.
  *
  * Rows repeating the title, type and year of an earlier row are
  * skipped, so every such combination is compiled only once.
  *
  * @param  string $results
  * @return array  compiled titles, empty array when no row matched
  */
 public function compileSearchResults($results)
 {
     $crawler = new crawler($results);

     //every search result is a table row carrying either the odd or the even class
     $sections = $crawler->filter('table.results > tr.odd, table.results > tr.even');

     //signatures (title, type, year) of the rows compiled so far
     $seen = array();
     $compiled = array();

     //loop trough every result and extract the information we require (title, plot, cast etc.)
     foreach ($sections as $value) {
         $cr = new crawler($value);

         $title = $cr->filter('td.image > a')->extract('title');
         $poster = $cr->filter('td.image > a > img')->extract('src');
         $rating = $cr->filter('td.title > div.user_rating')->extract('_text');
         $imdbid = $cr->filter('td.image > a')->extract('href');
         $shortTitle = head($cr->filter('td.title > a')->extract('_text'));

         //resolved once per row, used for both the signature and the compiled entry
         $type = $this->typeFromTitle($title);
         $year = $this->year($title);

         //exact lookup keyed by the whole signature: a substring search over one
         //concatenated string would also reject "Man" once "Iron Man" was seen,
         //and the lookup must not depend on the row index because skipped rows
         //leave gaps between row index and position in $compiled
         $signature = $shortTitle . '|' . $type . '|' . $year;
         if (isset($seen[$signature])) {
             continue;
         }
         $seen[$signature] = true;

         $compiled[] = array(
             'imdb_id' => $this->imdbid($imdbid),
             'title' => $shortTitle,
             'original_title' => $shortTitle,
             'type' => $type,
             'poster' => $this->posterSize($poster),
             'year' => $year,
             'plot' => head($cr->filter('td.title > span.outline')->extract('_text')),
             'genre' => head($cr->filter('td.title > span.genre')->extract('_text')),
             'imdb_rating' => $this->cleanRating($rating),
             'runtime' => trim(head($cr->filter('span.runtime')->extract('_text')), ' mins.'),
             'imdb_votes_num' => head($cr->filter('td.sort_col')->extract('_text')),
         );
     }

     return $compiled;
 }
 /**
  * Crawl the submitted url and pass every url the crawler has seen to Archemy.
  *
  * Without a 'url' input the 'crawlurl' view is shown instead. Otherwise the
  * number of seen urls and each Archemy response are echoed, and the
  * 'archemy.show' view is placed into the layout.
  */
 public function crawl()
 {
     if (!Input::has('url')) {
         return $this->layout->content = View::make('crawlurl');
     }

     $seedUrl = Input::get("url");
     $maxDepth = Input::get("depth");

     $spider = new crawler($seedUrl, $maxDepth);
     $spider->run();

     //print how many urls the crawl recorded
     echo sizeof($spider->getSeen());

     foreach ($spider->getSeen() as $pageUrl => $seen) {
         $analysis = new Archemy($pageUrl, 1);
         //relations are parsed only when Archemy reports a positive status
         if ($analysis->status()) {
             $analysis->parseRelations();
         }
         echo $analysis->response();
     }

     $viewData = array(
         "response" => "",
         "relations" => Sentence::where("full_text", $seedUrl),
         "data" => $seedUrl,
     );
     $this->layout->content = View::make('archemy.show', $viewData);
 }
<?php

//This crawler searches through the links in a domain,
//makes an array from them and then searches them for emails.
//See it in action at http://jacksworkspace.com
//allow the script to run for up to 6400 seconds
set_time_limit(6400);
//presumably filled with the collected links and emails further down; not used in the visible code
$array = array();
$emailarray = array();
//NOTE(review): $startURL and $depth are not assigned in the visible part of this script - confirm where they come from
$crawler = new crawler($startURL, $depth);
$crawler->run();
class crawler
{
    protected $_url;
    protected $_depth;
    protected $_host;
    protected $_useHttpAuth = false;
    protected $_user;
    protected $_pass;
    protected $_seen = array();
    protected $_filter = array();
    /**
     * Store the url and depth and derive the host from the url.
     *
     * @param string $url   url handed to the crawler
     * @param int    $depth depth value stored for the crawl, 5 by default
     */
    public function __construct($url, $depth = 5)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        //ask parse_url for the host component only: it yields null when the url
        //has no host and false when the url is malformed, so nothing has to be
        //indexed and a host-less url no longer raises an undefined index notice
        $host = parse_url($url, PHP_URL_HOST);
        //keep null for "no host", as the array lookup used to produce
        $this->_host = is_string($host) ? $host : null;
    }
    protected function _processAnchors($content, $url, $depth)
    {
        $dom = new DOMDocument('1.0');
        @$dom->loadHTML($content);
<?php

session_start();

//Run the crawler step named by the "function" query parameter.
//A crawler is created whenever the parameter is present; unknown names do nothing.
if (isset($_GET['function'])) {
    $crawler = new crawler();
    switch ($_GET['function']) {
        case 'init':
            $crawler->init();
            break;
        case 'crawlerFetch':
            $crawler->crawlerFetch();
            break;
        case 'crawlerProcess':
            $crawler->crawlerProcess();
            break;
    }
}
class crawler
{
    private $funcs;
    private $mysql;
    //Load the helper files and create the funcs and mySQL objects kept on this instance.
    function __construct()
    {
        //require_once includes each file a single time, however many crawlers are built
        //NOTE(review): both paths are relative - confirm the files resolve from where this script runs
        require_once "funcs.php";
        require_once "mysql.class.php";
        //presumably these classes are declared in the two files above - verify
        $this->funcs = new funcs();
        $this->mysql = new mySQL();
    }
    function init()
    {
        echo "Creating database<br>";
Example #5
0
 /**
  * Compiles titles cast.
  *
  * Rows in which no actor photo element is found are skipped, since
  * the actor name that keys the entry is read from that element.
  *
  * @return array cast keyed by actor name; every entry holds name, image, char and imdb_id
  */
 private function compileCast()
 {
     //get all the actor/char rows from imdb
     $raw = $this->crawler->filter('table.cast_list > tr.odd, table.cast_list > tr.even');

     $cast = array();

     //foreach row extract image, id, actor name and actors character(s)
     foreach ($raw as $v) {
         $crawler = new crawler($v);

         //get actor image (loadlate attribute) and name (title attribute)
         $actor = head($crawler->filter('.primary_photo > a > img')->extract(array('loadlate', 'title')));

         //nothing matched: head() gave no attribute pair, so there is no name to
         //key the entry by and last() must not be called on a non-array
         if (!is_array($actor) || !$actor) {
             continue;
         }
         $name = last($actor);

         //get actor id
         $actorid = Helpers::extract(head($crawler->filter('.primary_photo > a')->extract('href')), 'nm');

         //get char
         $char = head($crawler->filter('.character')->extract('_text'));
         $char = $this->prettify($char);

         //push all data into cast array
         $cast[$name] = array('name' => $name, 'image' => head($actor), 'char' => $char, 'imdb_id' => $actorid);
     }

     return $cast;
 }
Example #6
0
<?php

//Answer with an xml document describing the page given in the "url" query parameter:
//fetch status, normalised url, domain, path, links and indexable words.
header('Content-type: text/xml');
$xml = new SimpleXMLElement("<?xml version='1.0' encoding='utf-8'?" . "><crawler/>");
if (isset($_GET['url'])) {
    $crawler = new crawler();
    if ($crawler->fetchWebpage($_GET['url'])) {
        //Fetchs the webpage.
        $xml->fetch = "true";
        //normalise the url once: drop any http(s):// prefix and always report it as http://
        $normalizedUrl = "http://" . preg_replace('/^http[s]?:(\\/)(\\/)/', "", $_GET['url']);
        $xml->url = $normalizedUrl;
        $domain = parse_url($normalizedUrl);
        $xml->domain = $domain['host'];
        //a bare domain such as http://example.com has no path component, so
        //parse_url() returns no 'path' key; fall back to an empty path
        $path = isset($domain['path']) ? $domain['path'] : '';
        //strip a trailing slash, then the last path segment (the file name)
        //NOTE(review): the class A-z also matches [ \ ] ^ _ ` - kept as is because
        //file names containing underscores currently rely on it
        $xml->path = preg_replace('/(\\/)[a-zA-z0-9\\.]+$/', "", preg_replace('/(\\/)$/', "", $path));
        //drop a trailing "/segment/.." pair
        $xml->path = preg_replace('/(\\/([a-z0-9])+)(\\/\\.\\.)$/i', "", $xml->path);
        $crawler->fetchLinks($xml);
        //Fetchs all the links.
        $xml->links->addAttribute('count', count($xml->links[0]));
        $xml->out->addAttribute('count', count($xml->out[0]));
        //only the count of the out links is reported, the links themselves are removed
        unset($xml->out->link);
        $crawler->indexableData($xml);
        //Fetchs data for indexing.
        $xml->words->addAttribute('count', count($xml->words[0]));
    } else {
        $xml->fetch = "failed";
    }
}
print $xml->asXML();
class crawler
{
    private $funcs;
    private $mysql;
Example #7
0
    die('Connect Error (' . $mysqli->connect_errno . ') ' . $mysqli->connect_error);
}
//Poll forever: take the keyword row with id 6 while its times exceeds clicked_times,
//run the casperjs script for it through the crawler's proxy, then record the click.
for (;;) {
    $sql = "SELECT * FROM keyword WHERE id = 6 AND times > clicked_times ORDER BY last_click_time ASC LIMIT 1";
    $result = $mysqli->query($sql);
    $data = array();
    if ($result) {
        while ($obj = $result->fetch_object()) {
            $data[] = $obj;
        }
        //the loop never ends, so release the result set explicitly
        $result->free();
    }
    if (!$data) {
        //nothing to click (or the query failed): wait instead of re-querying the database at full speed
        sleep(1);
        continue;
    }
    foreach ($data as $obj) {
        $kwd = $obj->kwd;
        $nid = $obj->nid;
        //$nid = '37770555506';
        $crawler = new crawler($kwd, $nid);
        $proxy = $crawler->proxy;
        echo $proxy . "\n";
        $url = $crawler->getPage();
        echo $url . "\n";
        //NOTE(review): $proxy, $url and $nid reach the shell unescaped; a value holding
        //", $ or ` would be interpreted by the shell. escapeshellarg() may strip
        //multibyte characters depending on the locale, so confirm the locale before switching to it
        $cmd = "/usr/bin/casperjs --proxy=" . $proxy . " /var/html/casperjs/tb.js \"" . $url . "\" " . $nid;
        echo $cmd . "\n";
        system($cmd);
        //cast the id so only an integer can ever be concatenated into the statement
        $sql = "UPDATE keyword SET clicked_times = clicked_times + 1, last_click_time = " . time() . " WHERE id = " . (int) $obj->id;
        $mysqli->query($sql);
    }
}
//$kwd = '负重绑腿';
//$nid = '36962206480';
//$crawler = new crawler($kwd, $nid);
//$proxy = $crawler->proxy;
Example #8
0
 /**
  * Gets the titles the actor is known for.
  *
  * The list is built from the crawled page only while $this->knownFor is
  * still empty; the poster urls in $this->imgUrls are handed to the image
  * saver on every call.
  *
  * @return array
  */
 public function getKnownFor()
 {
     if (!$this->knownFor) {
         //one div per known-for title
         foreach ($this->crawler->filter('div#knownfor > div') as $node) {
             $entry = new crawler($node);

             $id = $this->id($entry->filter('a')->extract('href'));
             $name = head($entry->filter('a > img')->extract('title'));
             $image = $this->image($entry->filter('a > img')->extract('src'), $id);

             //the year is read from the text of the second link
             $yearText = head($entry->filter('a')->eq(1)->extract('_text'));
             $released = Helpers::extractYear($yearText);

             $this->knownFor[] = array(
                 'imdb_id' => $id,
                 'title' => $name,
                 'poster' => $image,
                 'year' => $released,
             );
         }
     }

     $this->images->saveMultiple($this->imgUrls, null, 'imdb/posters/');

     return $this->knownFor;
 }
Example #9
0
 /**
  * Compile the critic reviews of the crawled page into a save ready array.
  *
  * @return array|null one entry (source, author, body, link, score) per review,
  *                    null when the page lists no review
  */
 private function compileReviews()
 {
     //stays null unless at least one review is appended below
     $compiled = null;

     foreach ($this->crawler->filter('ol.critic_reviews > li') as $item) {
         $review = new crawler($item);

         $source = head($review->filter('div.source')->extract(array('_text')));
         $author = head($review->filter('div.author > a')->extract(array('_text')));
         $body = trim(head($review->filter('div.review_body')->extract(array('_text'))));
         $link = head($review->filter('a.external')->extract(array('href')));
         $score = trim(head($review->filter('div.review_grade')->extract(array('_text'))));

         $compiled[] = array(
             'source' => $source,
             'author' => $author,
             'body' => $body,
             'link' => $link,
             'score' => $score,
         );
     }

     return $compiled;
 }
Example #10
0
//Bootstrap of the queue worker: runtime settings, class files, database and queue connections.
//remove the execution time limit
set_time_limit(0);
date_default_timezone_set('Asia/Shanghai');
//class files are loaded from the directory of this script
require_once dirname(__FILE__) . '/class.crawler.php';
require_once dirname(__FILE__) . '/class.proxy.php';
require_once dirname(__FILE__) . '/class.detector.php';
//NOTE(review): database host, user and password are hard-coded - consider moving them to configuration
//NOTE(review): the connection is not checked for errors before it is used
$mysqli = new mysqli('10.168.45.191', 'admin', 'txg19831210', 'crawler');
//use gbk as the connection character set
$mysqli->query('SET NAMES gbk');
//name of the AMQP queue this script reads from
$queueName = 'q_crawler';
//connection settings for the AMQP broker
$params = array('host' => '10.168.45.191', 'port' => 5672, 'login' => 'guest', 'password' => 'guest', 'vhost' => '/kwd');
$conn = new AMQPConnection($params);
$conn->connect();
$channel = new AMQPChannel($conn);
$queue = new AMQPQueue($channel);
$queue->setName($queueName);
//a single crawler instance is created here, before the message loop starts
$crawler = new crawler();
while ($message = $queue->get(AMQP_AUTOACK)) {
    $kwd = $message->getBody();
    $kwdArr = unserialize($kwd);
    $crawler->run($kwdArr);
    #    print_r($kwdObj);
    #    $price = $detector->run($kwdObj);
    #    if ($price['start_price'] && $price['end_price']) {
    #        $sql = "SELECT * FROM price WHERE kid = {$kwdObj->id} LIMIT 1";
    #        $result = $mysqli->query($sql);
    #        if ($result->num_rows) {
    #            $sql = "UPDATE price SET min_price = '{$price['start_price']}', max_price = '{$price['end_price']}', region = '{$price['region']}', crawl_status = 2, last_update = " . time(). " WHERE kid = " . $kwdObj->id;
    #        }
    #        else {
    #            $sql = "INSERT INTO price SET kid = {$kwdObj->id}, shop_type = '{$kwdObj->shop_type}', min_price = '{$price['start_price']}', max_price = '{$price['end_price']}', region = '{$price['region']}', crawl_status = 2, last_update = " . time();
    #        }