/**
 * Prints one HTML anchor for every '<a href="' target found on the line the
 * crawler is positioned at by go2linewhere('<p><a href="').
 *
 * The site's redirect prefix is stripped from each target before it is
 * echoed, and output is flushed after every anchor.
 */
public function go()
{
    $crawler = new Crawler($this->url);
    $crawler->go2linewhere('<p><a href="');
    $crawler->close();

    // Element 0 is the text in front of the first anchor, so it is skipped.
    $segments = array_slice(explode('<a href="', $crawler->curline), 1);
    foreach ($segments as $segment) {
        $link = Crawler::cutuntil($segment, '"');
        $link = str_replace('http://hentaifromhell.net/redirect.html?', '', $link);
        echo "<a href='{$link}'>{$link}</a><br />\n";
        flush();
    }
}
/**
 * Walks over every pixel coordinate of the image and crawls an outline from
 * each pixel that is neither white nor already contained in a collected
 * outline, then slices the image once per outline.
 *
 * @return array Two elements: the ImageCollection of slices and the
 *               CrawlerOutlineCollection they were cut from.
 */
public function getHotSpots()
{
    $tracer = new Crawler($this);
    $found = new CrawlerOutlineCollection();
    $dimensions = $this->image->size();

    for ($col = 0; $col < $dimensions[0]; $col++) {
        for ($row = 0; $row < $dimensions[1]; $row++) {
            $current = $this->pixel($col, $row);

            // Skip white pixels (compare() is called with 5 as its second
            // argument) and pixels that belong to an already crawled area.
            $isWhite = $current->color()->compare(ImageColor::white(), 5);
            if ($isWhite || $found->contains($current)) {
                continue;
            }

            $found->push($tracer->crawl($col, $row));
        }
    }

    $slices = new ImageCollection();
    foreach ($found as $shape) {
        $slices->push($this->image->sliceByOutline($shape));
    }

    return array($slices, $found);
}
/**
 * Echoes an anchor for every href found between the page's
 * '<div class="entry">' marker and the next line containing '</div>'.
 *
 * Every anchor is labelled with a name derived from the URL itself, and
 * output is flushed after each one.
 *
 * @param string $url Address of the page to scan; its last character is
 *                    dropped before the label is derived (presumably a
 *                    trailing slash).
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    $dirname = html_entity_decode(Crawler::cutfromlast1(substr($url, 0, -1), '/'));

    $c = new Crawler($url);
    $c->go_to('<div class="entry">');
    while ($line = $c->readline()) {
        // A single-quoted href takes precedence over a double-quoted one.
        $quote = null;
        if (Crawler::is_there($line, "href='")) {
            $quote = "'";
        } elseif (Crawler::is_there($line, 'href="')) {
            $quote = '"';
        } elseif (Crawler::is_there($line, '</div>')) {
            break;
        }

        if ($quote === null) {
            continue;
        }

        $img = Crawler::extract($line, 'href=' . $quote, $quote);
        echo "<a href='{$img}'>{$dirname}</a><br/>\n";
        flush();
    }
    $c->close();
}
/**
 * Logs the user in by filling and submitting the form attached to the
 * '_submit' button.
 *
 * @param Crawler $crawler Crawler holding the page with the login form
 * @return mixed Whatever the client's submit() call returns
 */
private function login($crawler)
{
    $submitButton = $crawler->selectButton('_submit');
    $loginForm = $submitButton->form();

    // Set the credential fields
    $loginForm['_username'] = '******';
    $loginForm['_password'] = '******';

    return $this->client->submit($loginForm);
}
/**
 * Logs in with the test account and starts a crawl of its first collection,
 * limited to 10 pages. The test makes no assertions of its own.
 */
public function testCrawl()
{
    $collection = Account::login("searchzen.org", "test")->collections[0];

    $crawler = new Crawler($collection);
    $crawler->pageLimit = 10;
    $crawler->start();
}
/**
 * Start the crawler to retrieve pages from a given news website.
 *
 * @param int    $nrOfDaysBack The nr of days the crawler should go back (counting from today)
 * @param string $newsSiteUrl  The root URL of the news site (the seed of the crawler)
 * @param mixed  $timeToLive   Handed to the Crawler constructor as its second argument
 * @param mixed  $startDate    Optional; when truthy it is forwarded to crawl() as second argument
 * @return int Number of entries returned by the crawler's getCrawled()
 */
public function crawlForNews($nrOfDaysBack, $newsSiteUrl, $timeToLive, $startDate = null)
{
    $newsCrawler = new Crawler($newsSiteUrl, $timeToLive);

    if (!$startDate) {
        $newsCrawler->crawl($nrOfDaysBack);
    } else {
        $newsCrawler->crawl($nrOfDaysBack, $startDate);
    }

    $crawled = $newsCrawler->getCrawled();

    return count($crawled);
}
/**
 * Builds ABSPATH . 'sitemap.xml' for the site given in the query string and
 * then terminates the request.
 *
 * Nothing happens unless 'site_url', 'sitemap_url' and 'CSRF_token' are all
 * present in $_GET and the token passes CSRF validation. The token is only
 * read once it is known to exist, so a request without one no longer
 * triggers an undefined-index notice.
 */
public static function run()
{
    if (!isset($_GET['site_url'], $_GET['sitemap_url'], $_GET['CSRF_token'])
        || !CODOF\Access\CSRF::valid($_GET['CSRF_token'])
    ) {
        return;
    }

    $sitemapObject = new Crawler($_GET['site_url']);
    $sitemapObject->createSitemap(ABSPATH . 'sitemap.xml');

    exit;
}
/**
 * Resolves a page to an HTML anchor pointing at its download target.
 *
 * The target is the value of the location.href assignment found after
 * seeking with go2lineregexor(); the anchor links to the target's directory
 * plus its 'path' parameter and is labelled with the URL-decoded
 * 'file_orig' parameter.
 *
 * @param string $url Page to inspect
 * @return string The anchor markup
 */
function crawl_indowebster($url)
{
    $page = new Crawler($url);
    $page->go2lineregexor('/(<\\/div><\\/a><\\/div><\\/div>)/', 1, 'href="#idws7"');
    $target = $page->getbetween('location.href=\'', '\'');

    $path = Crawler::extract($target, 'path=', '&');
    $originalName = Crawler::cutafter($target, 'file_orig=');
    $page->close();

    return sprintf(
        '<a href="%s/%s">%s</a>',
        dirname($target),
        $path,
        rawurldecode($originalName)
    );
}
/**
 * Finds the image of one reader page and derives a download name for it.
 *
 * The name is "<prefix>-<chapter passed through Crawler::pad(.., 3)>-<trailing
 * 'digits.extension' part of the image's basename>".
 *
 * @param mixed  $fil     Source handed to the Crawler constructor
 * @param string $url     Not used by this method
 * @param string $prefix  Leading part of the generated name
 * @param mixed  $chapter Chapter number
 * @return array Single-entry map: generated name => image URL
 */
public function mangareader_1_page($fil, $url, $prefix, $chapter)
{
    $paddedChapter = Crawler::pad($chapter, 3);

    $page = new Crawler($fil);
    $page->go_to('width="800"');
    $img = $page->getbetween('src="', '"');

    preg_match('/(\\d+\\.\\w+)$/', basename($img), $parts);
    $imageName = $parts[1];
    $page->close();

    $name = implode('-', array($prefix, $paddedChapter, $imageName));

    return array($name => $img);
}
/**
 * Registers a map entry for every path the crawler reports below a source
 * directory.
 *
 * @param string $src      Directory below the source root; when empty the
 *                         source root itself is crawled
 * @param mixed  $patterns Handed to the Crawler constructor unchanged
 */
public function addCrawlMap($src, $patterns)
{
    $root = empty($src) ? $this->_src_root : $this->_src_root . '/' . $src;

    $crawler = new Crawler($root, $patterns);
    foreach ($crawler->getPaths() as $path) {
        // The destination is the path with every 'site/' occurrence removed.
        $destination = str_replace('site/', '', $path);
        $this->addMap($src . '/' . $path, $destination);
    }
}
/**
 * Echoes a list item linking to the image of one reader page.
 *
 * The link text is "<this->prefix>-<chapter passed through
 * Crawler::pad(.., 3)>-<trailing 'digits.extension' part of the image's
 * basename>".
 *
 * @param mixed  $fil     Source handed to the Crawler constructor
 * @param string $url     Not used by this method
 * @param mixed  $chapter Chapter number
 */
public function mangareader_1_page($fil, $url, $chapter)
{
    $prefix = $this->prefix;
    $paddedChapter = Crawler::pad($chapter, 3);

    $page = new Crawler($fil);
    $page->go_to('width="800"');
    $img = $page->getbetween('src="', '"');

    preg_match('/(\\d+\\.\\w+)$/', basename($img), $parts);
    $label = "{$prefix}-{$paddedChapter}-{$parts[1]}";

    echo '<li><a href="' . $img . '">' . $label . '</a>' . "</li>\n";
    $page->close();
}
/**
 * Drives the multi-step crawl flow and returns the view to render.
 *
 * Steps, checked in this order:
 *  1. booking parameters present and a crawler is available: make the
 *     reservation and wipe the session;
 *  2. time, day and movie parameters present and a crawler is available:
 *     show the dinner alternatives;
 *  3. a URL was submitted: crawl it, show the films for the matching day
 *     (or a "no matching days" result) and remember the URL in the session;
 *  4. otherwise: show the input form.
 *
 * A crawler is available in steps 1 and 2 only when $_SESSION['url'] was set
 * by an earlier request.
 *
 * @return mixed The CrawlerResultViewSwe holding the output, or a formView
 */
public function handleCrawling()
{
    $view = new CrawlerResultViewSwe();

    if (isset($_SESSION['url'])) {
        $url = $_SESSION['url'];
        $curl = new Curl();
        $crawler = new Crawler($curl, $url);
    }

    if ($view->bookParamExists() && isset($crawler)) {
        // The user wants to book
        $group1 = $view->getBookInfo();
        $reservationInfo = $crawler->getReservations($group1);
        $view->outPutReservationResult($reservationInfo);

        // Destroy the session
        $_SESSION = array();
        session_destroy();
        session_unset();
    } elseif ($view->timeParamExists() && $view->dayParamExists() && $view->movieParamExists() && isset($crawler)) {
        $day = $view->getDay();
        $time = $view->getTime();
        $movie = $view->getMovie();
        $dinnerInfo = $crawler->getDinnerInfo($day, $time);
        $view->outPutDinnerAlts($dinnerInfo, $time, $movie);
    } elseif ($view->userHasSubmittedURL()) {
        $url = $view->getURL();
        $curl = new Curl();
        $crawler = new Crawler($curl, $url);
        $crawler->getLinks();
        $dates = $crawler->getCalendarInfo();

        $comparer = new compareData();
        $matchingDay = $comparer->compareCommonDates($dates);
        if ($matchingDay === null) {
            $view->noMatchingDays();
        } else {
            $filmDates = $crawler->getFilmInfo($matchingDay);
            $view->outPutMovieResult($filmDates, $matchingDay);
        }

        $_SESSION['url'] = $url;
    } else {
        $view = new formView();
    }

    return $view;
}
/**
 * Runs a crawl with a dummy Flickr API key configured and checks the
 * expanded URL and error recorded for links 43, 42 and 41.
 */
public function testFlickrCrawl()
{
    // The builder results are kept in locals until the test ends.
    $builders = $this->buildData();
    $crawler = Crawler::getInstance();
    $config = Config::getInstance();

    // Use a fake Flickr API key
    $plugin_builder = FixtureBuilder::build('plugins', array('id'=>'2', 'folder_name'=>'flickrthumbnails'));
    $option_builder = FixtureBuilder::build('options', array(
        'namespace' => OptionDAO::PLUGIN_OPTIONS . '-2',
        'option_name' => 'flickr_api_key',
        'option_value' => 'dummykey')
    );

    $this->simulateLogin('*****@*****.**', true);
    $crawler->crawl();

    // link id => array(expected expanded_url, expected error)
    $expectations = array(
        43 => array('http://farm3.static.flickr.com/2755/4488149974_04d9558212_m.jpg', ''),
        42 => array('', 'No response from Flickr API'),
        41 => array('', 'No response from Flickr API'),
    );

    $ldao = DAOFactory::getDAO('LinkDAO');
    foreach ($expectations as $link_id => $expected) {
        $link = $ldao->getLinkById($link_id);
        $this->assertEqual($link->expanded_url, $expected[0]);
        $this->assertEqual($link->error, $expected[1]);
    }
}
/**
 * Destroy Config, Webapp, Crawler, $_SESSION, $_POST, $_GET, $_REQUEST,
 * $_SERVER and $_FILES, unregister the Loader, and remove the backup
 * directory and the compiled_view directory from disk.
 */
public function tearDown()
{
    Config::destroyInstance();
    Webapp::destroyInstance();
    Crawler::destroyInstance();

    if (isset($_SESSION)) {
        $this->unsetArray($_SESSION);
    }
    $this->unsetArray($_POST);
    $this->unsetArray($_GET);
    $this->unsetArray($_REQUEST);
    $this->unsetArray($_SERVER);
    $this->unsetArray($_FILES);
    Loader::unregister();

    $backup_dir = FileDataManager::getBackupPath();
    if (file_exists($backup_dir)) {
        try {
            // '&&' keeps "rm -rf *" from running in the current working
            // directory when the cd fails; the path is shell-escaped so
            // spaces or metacharacters cannot alter the command.
            @exec('cd ' . escapeshellarg($backup_dir) . ' && rm -rf *');
            rmdir($backup_dir); // won't delete if has files
        } catch (Exception $e) {
            // Cleanup is best effort; a failure must not abort tearDown.
        }
    }

    $data_dir = FileDataManager::getDataPath();
    if (file_exists($data_dir . 'compiled_view')) {
        try {
            @exec('cd ' . escapeshellarg($data_dir) . ' && rm -rf compiled_view');
        } catch (Exception $e) {
            // Cleanup is best effort; a failure must not abort tearDown.
        }
    }

    parent::tearDown();
}
/**
 * Launches the crawler for an authorized caller.
 *
 * With more than one CLI argument, argv[1] is the username and the password
 * is argv[2] or, when that is absent, the THINKUP_PASSWORD environment
 * variable. Without CLI arguments the caller must already be logged in.
 *
 * @return string Empty when the crawl was launched, otherwise an error message
 */
public function control()
{
    $output = "";
    $authorized = false;

    if (isset($this->argc) && $this->argc > 1) {
        // Check the CLI credentials
        $session = new Session();
        $username = $this->argv[1];
        $pw = ($this->argc > 2) ? $this->argv[2] : getenv('THINKUP_PASSWORD');

        $owner_dao = DAOFactory::getDAO('OwnerDAO');
        $owner = $owner_dao->getByEmail($username);
        if (!$owner_dao->isOwnerAuthorized($username, $pw)) {
            $output = "ERROR: Incorrect username and password.";
        } else {
            $authorized = true;
            Session::completeLogin($owner);
        }
    } elseif ($this->isLoggedIn()) {
        // Logged in on the web
        $authorized = true;
    } else {
        $output = "ERROR: Invalid or missing username and password.";
    }

    if ($authorized) {
        Crawler::getInstance()->crawl();
    }

    return $output;
}
/**
 * Returns the shared instance, creating it on first use.
 *
 * @param int $seconds Constructor argument; only used by the call that
 *                     actually creates the instance
 * @return self
 */
public static function getInstance($seconds = 15)
{
    if (!empty(self::$instance)) {
        return self::$instance;
    }

    self::$instance = new self($seconds);

    return self::$instance;
}
/**
 * Scrapes the New York craigslist index pages for the 'nfa', 'roo' and 'sub'
 * sections and inserts one unscraped listing row per '.row > .txt' node.
 *
 * A failure on a single listing is logged together with the statement's
 * error info and does not stop the run.
 */
function perform()
{
    $ps = DB::prepare('INSERT INTO listings SET scraped=FALSE, code=:code, title=:title, link=:link, date=:date, price=:price, neighborhood=:neighborhood');

    // The section code is both the URL path segment and the stored `code`,
    // so the URL is built from it instead of cutting it out by offset.
    foreach (['nfa', 'roo', 'sub'] as $code) {
        $url = 'http://newyork.craigslist.org/' . $code . '/';
        $crawler = new Crawler(Guzzle::get($url)->getBody());

        $crawler->filter('.row > .txt')->each(function ($node) use ($ps, $code) {
            try {
                $a = $node->filter('.pl > a.hdrlnk');
                $title = $a->text();
                $link = $a->attr('href');

                // date() replaces strftime(), which is deprecated as of
                // PHP 8.1; both produce the same YYYY-MM-DD string.
                $date = date('Y-m-d', strtotime($node->filter('.pl > .date')->text()));

                // Price and neighborhood are optional in the markup.
                $priceNode = $node->filter('.l2 > .price');
                $price = $priceNode->count() ? preg_replace('/\\D/', '', $priceNode->text()) : null;

                $hoodNode = $node->filter('.l2 > .pnr > small');
                $neighborhood = $hoodNode->count() ? $hoodNode->text() : null;

                $ps->execute([
                    ':code' => $code,
                    ':title' => $title,
                    ':link' => $link,
                    ':date' => $date,
                    ':price' => $price,
                    ':neighborhood' => $neighborhood,
                ]);
            } catch (Exception $e) {
                Logger::error($e->getMessage(), $ps->errorinfo());
            }
        });
    }
}
/**
 * Every user agent listed in devices.txt must not be reported as a crawler.
 *
 * The fixture is read without line terminators and without blank lines, so
 * each detector input is the bare user-agent string. An unreadable or empty
 * fixture fails the test instead of letting it pass without checking
 * anything.
 */
public function testDevices()
{
    $lines = file(__DIR__ . '/devices.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    $this->assertNotEmpty($lines, 'devices.txt could not be read or is empty');

    foreach ($lines as $line) {
        // Expected value first, actual second; the line doubles as the
        // failure message.
        $this->assertEquals(false, Crawler::isCrawler($line), $line);
    }
}
/**
 * Every user agent in the upstream devices.txt list must not be reported as
 * a crawler.
 *
 * The list is downloaded at test time and read without line terminators and
 * without blank lines, so each detector input is the bare user-agent string.
 * A failed download or an empty list fails the test instead of letting it
 * pass without checking anything.
 */
public function testDevices()
{
    $lines = file(
        'https://raw.githubusercontent.com/JayBizzle/Crawler-Detect/master/tests/devices.txt',
        FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
    );
    $this->assertNotEmpty($lines, 'devices.txt could not be downloaded or is empty');

    foreach ($lines as $line) {
        // Expected value first, actual second; the line doubles as the
        // failure message.
        $this->assertEquals(false, Crawler::isCrawler($line), $line);
    }
}
/**
 * Echoes one image anchor per page of a chapter.
 *
 * Page ids are the option values read after the 'name="pagejump"' marker, up
 * to the line containing '</select>'. The image directory is taken from the
 * src found on the line after the 'id="nextpage"' marker, and every page is
 * assumed to be "<page id>.jpg" inside that directory.
 *
 * @param string $url     Chapter page to read
 * @param mixed  $chapter Chapter number, passed through Crawler::pad(.., 3)
 *                        for the link text
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $c = new Crawler($url);
    $c->go_to('name="pagejump"');

    $pages = array();
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, '<option')) {
            $pages[] = Crawler::extract($line, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($line, '</select>')) {
            break;
        }
    }

    $c->go_to('id="nextpage"');
    $c->readline();
    $img = $c->getbetween('src="', '"');
    $c->close();

    $img_base = dirname($img);
    $ext = '.jpg';
    $chapter = Crawler::pad($chapter, 3);

    foreach ($pages as $page) {
        $file = "{$page}{$ext}";
        echo "<a href='{$img_base}/{$file}'>{$prefix}-{$chapter}-{$file}</a><br/>\n";
        flush();
    }
}
/**
 * Echoes an anchor to the comic image of one page, labelled
 * "<year>_<month>_<day>_<title><extension>".
 *
 * The title comes from the '<title>' line (text after 'PHD Comics: ', with
 * every non-word character replaced by '_'), and the date from the second
 * line read after the 'date_left.gif' marker.
 *
 * @param string $url Page to read
 */
public function crawl_page($url)
{
    $c = new Crawler($url);

    // Title
    $c->go_to('<title>');
    $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
    $title = preg_replace('/\\W/', '_', $title);

    // Date, written as month/day/year on the page
    $c->go_to('date_left.gif');
    $c->readline(2);
    preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/mi', $c->curline, $matches);
    list(, $month, $date, $year) = $matches;
    $date = str_pad($date, 2, '0', STR_PAD_LEFT);
    $month = str_pad($month, 2, '0', STR_PAD_LEFT);
    $fileprefix = "{$year}_{$month}_{$date}_{$title}";

    // Image URL. The capture stops at a quote or '>' as well as at a space,
    // so a quoted src="..." value no longer drags its closing quote into the
    // URL and the extension.
    $c->go2linewhere('<td bgcolor=#FFFFFF');
    preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $matches);
    $img = $matches[1];
    $filename = basename($img);
    $ext = substr($filename, strrpos($filename, '.'));

    echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
    flush();

    $c->close();
    unset($c);
}
/**
 * Walks a paginated thumbnail gallery starting at $this->url and echoes an
 * anchor to the full-size image behind every thumbnail.
 *
 * Each page is read from the '<li class="thumbnail">' marker until a
 * 'class="pagNext"' link (continue with that page) or '</table>' (stop) is
 * found. Output is flushed as it is produced.
 *
 * NOTE(review): the next-page href is followed exactly as extracted; if the
 * site emits relative links a base URL would have to be prepended - confirm.
 */
public function go()
{
    $start_url = $this->url;

    $finish = false;
    while (!$finish) {
        // Stays true unless a next-page link is found below.
        $finish = true;
        echo $start_url, "<br />\n";
        flush();

        $c = new Crawler($start_url);
        $c->go2linewhere('<li class="thumbnail">');
        while ($line = $c->readline()) {
            if (strpos($line, 'src="') !== false) {
                // Thumbnail line: turn the thumbnail URL into the image URL
                $uri = Crawler::extract($line, 'src="', '"');
                $uri = str_replace('/thumbs/', '/images/', $uri);
                // Drop the '/small/<digits>-' part, if the URL has one.
                if (preg_match('/(\\/small\\/\\d+-)/', $uri, $matches)) {
                    $uri = str_replace($matches[1], '/', $uri);
                }
                $file = basename(dirname($uri));
                echo "<a href='{$uri}'>{$file}</a><br/>\n";
                flush();
            } elseif (strpos($line, 'class="pagNext"') !== false) {
                // Next page
                $finish = false;
                $start_url = html_entity_decode(Crawler::extract($line, 'class="pagNext" href="', '"'));
                break;
            } elseif (strpos($line, '</table>') !== false) {
                // Done
                break;
            }
        }
        $c->close();
    }
}
/**
 * Decides whether a link found on a page should be followed.
 *
 * A relative link is accepted when its URL end differs from the page's and
 * has not been crawled yet. An absolute link is accepted when its
 * significant host part equals the page's and the crawler's robots rules
 * allow its path.
 *
 * @param string  $link    Link as found on the page
 * @param string  $pageUrl URL of the page the link was found on
 * @param Crawler $crawler Supplies the crawled URL ends and the robots rules
 * @return bool True when the link should be followed; false as well when
 *              either URL has no host to compare
 */
public static function filterLink($link, $pageUrl, Crawler $crawler)
{
    if (Util::isRelative($link)) {
        $linkEnd = self::getUrlEnd($link);

        return $linkEnd !== self::getUrlEnd($pageUrl)
            && !in_array($linkEnd, $crawler->getCrawledUrlEnds(), true);
    }

    $parseLink = parse_url($link);
    $parsePage = parse_url($pageUrl);
    // isset() also covers parse_url() returning false for a malformed URL.
    if (!isset($parseLink['host'], $parsePage['host'])) {
        return false;
    }

    $path = isset($parseLink['path']) ? $parseLink['path'] : $link;

    $Linkhost = Util::getSignificantHostPart($parseLink['host']);
    $PageHost = Util::getSignificantHostPart($parsePage['host']);

    return $Linkhost === $PageHost && $crawler->getRobots()->checkAllowed($path);
}
/**
 * Prepares the fixture: registers TwitterPlugin with the webapp (as
 * 'twitter') and with the crawler, makes 'twitter' the active plugin, and
 * stores the webapp, crawler and logger instances on the test case.
 */
public function setUp()
{
    parent::setUp();

    $webapp = Webapp::getInstance();
    $crawler = Crawler::getInstance();
    $this->webapp = $webapp;
    $this->crawler = $crawler;

    $webapp->registerPlugin('twitter', 'TwitterPlugin');
    $crawler->registerCrawlerPlugin('TwitterPlugin');
    $webapp->setActivePlugin('twitter');

    $this->logger = Logger::getInstance();
}
/**
 * Collects the page addresses of one chapter and hands them to
 * Crawler::multiProcess() with 'crawl_1_page' as the callback name.
 *
 * Page ids are the value="..." attributes on the line after the
 * 'id="myselectbox3"' marker; each id is appended to $url to form the page
 * address.
 *
 * @param string $url     Chapter URL, e.g. http://ani-haven.net/hr-alpha/Psyren/145/
 * @param mixed  $chapter Chapter number, passed through Crawler::pad(.., 3)
 *                        and forwarded as the extra callback argument
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($url);
    $c->go_to('id="myselectbox3"');
    $c->readline();
    $pages = Crawler::extract_to_array($c->curline, 'value="', '"');
    $c->close();

    // Turn every page id into a full address by prefixing it with $url
    foreach ($pages as &$page) {
        $page = $url . $page;
    }
    unset($page);

    Crawler::multiProcess(4, $pages, 'crawl_1_page', array($chapter));
}
/**
 * Completes every listing that has not been scraped yet: downloads its
 * detail page, reads the street address and a plain-text description,
 * geocodes the street (or, without one, the listing's neighborhood) and
 * stores the result with scraped=TRUE.
 *
 * A failure on a single listing is logged together with the statement's
 * error info and does not stop the run.
 */
function perform()
{
    $q = DB::query('SELECT link, neighborhood FROM listings WHERE scraped != TRUE', PDO::FETCH_ASSOC);
    $ps = DB::prepare('UPDATE listings SET scraped=TRUE, street=:street, description=:description, lat=:lat, lng=:lng WHERE link=:link');

    foreach ($q as $listing) {
        try {
            $body = Guzzle::get('http://newyork.craigslist.org' . $listing['link'])->getBody();
            $crawler = new Crawler($body);
            $readability = new Readability($body);

            $streetNode = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
            $street = $streetNode->count() ? $streetNode->text() : null;

            // The address is free text (spaces, '&', '#', ...), so it has to
            // be URL-encoded before it goes into the query string.
            $address = $street !== null ? $street : $listing['neighborhood'];
            $url = 'http://maps.googleapis.com/maps/api/geocode/json?address=' . urlencode((string) $address);
            $json = json_decode(Guzzle::get($url)->getBody(), true);
            $loc = isset($json['results'][0]) ? $json['results'][0]['geometry']['location'] : null;

            $description = $readability->init()
                ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8')))
                : null;

            $ps->execute([
                ':link' => $listing['link'],
                ':lat' => isset($loc['lat']) ? $loc['lat'] : null,
                ':lng' => isset($loc['lng']) ? $loc['lng'] : null,
                ':street' => $street,
                ':description' => $description,
            ]);
        } catch (Exception $e) {
            Logger::error($e->getMessage(), $ps->errorinfo());
        }
    }
}
/**
 * Tear down.
 *
 * Destroys the Config, Webapp and Crawler instances and resets the session's
 * 'user' entry to null when it is set.
 *
 * @TODO Destroy all SESSION variables
 * @TODO Destroy all REQUEST/GET/POST variables
 */
function tearDown()
{
    Config::destroyInstance();
    Webapp::destroyInstance();
    Crawler::destroyInstance();

    // Only the 'user' entry is touched; the rest of the session stays.
    if (isset($_SESSION['user'])) {
        $_SESSION['user'] = null;
    }

    parent::tearDown();
}
/**
 * Crawls $url and reports on every page hit.
 *
 * For each hit the number of <img> elements and the time spent counting
 * them are appended to $this->report and a summary line is shown; a line is
 * also shown before and after the crawl.
 *
 * @param string $url Address to start crawling from
 * @throws Exception
 */
public function crawl($url)
{
    $crawler = new Crawler();

    $onHit = function ($href, DOMDocument $dom) {
        $startedAt = microtime(true);
        $imgLength = $dom->getElementsByTagName('img')->length;
        $processTime = sprintf('%.6F', microtime(true) - $startedAt);

        $this->report[] = [
            'href' => $href,
            'imgLength' => $imgLength,
            'processTime' => $processTime,
        ];
        $this->show(" - {$href} [img: {$imgLength}]" . PHP_EOL);
    };
    $onBefore = function () {
        $this->show('Start crawl' . PHP_EOL);
    };
    $onAfter = function () {
        $this->show('Finish crawl' . PHP_EOL);
    };

    $crawler->on($crawler::EVENT_HIT_CRAWL, $onHit);
    $crawler->on($crawler::EVENT_BEFORE_CRAWL, $onBefore);
    $crawler->on($crawler::EVENT_AFTER_CRAWL, $onAfter);

    $crawler->crawl($url);
}
/**
 * Launching the crawler without a logged-in session is expected to raise an
 * UnauthorizedUserException.
 */
public function testCrawlUnauthorized()
{
    // The builder result is kept in a local until the test ends.
    $fixtures = $this->buildData();

    $crawler = Crawler::getInstance();
    $crawler->registerPlugin('hellothinkup', 'HelloThinkUpPlugin');
    $crawler->registerCrawlerPlugin('HelloThinkUpPlugin');

    $expected = new UnauthorizedUserException('You need a valid session to launch the crawler.');
    $this->expectException($expected);

    $crawler->crawl();
    $this->assertNoErrors();
}
/**
 * After a crawl, link 1 must carry its expanded URL and an empty error.
 */
function testExpandURLsCrawl()
{
    Crawler::getInstance()->crawl();

    // The crawler closes the log, so it has to be re-opened here.
    $logger = Logger::getInstance();

    $linkDao = DAOFactory::getDAO('LinkDAO');
    $link = $linkDao->getLinkById(1);

    $this->assertEqual($link->expanded_url, 'http://www.thewashingtonnote.com/archives/2010/04/communications/');
    $this->assertEqual($link->error, '');
}