Esempio n. 1
0
 /**
  * Fetches the first two Ask result pages for the query "foo" and checks
  * that every response exposes ten urls, ten titles and ten snippets.
  */
 public function testFetchingMainMethod()
 {
     $fetcher = Builder::create('Ask');
     $sections = array('urls', 'titles', 'snippets');
     $pageUrls = array(
         'http://us.ask.com/web?q=foo',
         'http://us.ask.com/web?q=foo&page=2',
     );
     foreach ($pageUrls as $pageUrl) {
         $serp = $fetcher->fetch($pageUrl);
         // All key checks run before any size check, so a missing key is reported first.
         foreach ($sections as $section) {
             $this->assertArrayHasKey($section, $serp);
         }
         foreach ($sections as $section) {
             $this->assertCount(10, $serp[$section]);
         }
     }
 }
Esempio n. 2
0
 /**
  * Fetches two consecutive Bing result pages for the query "foo" and checks
  * that each one yields ten entries under 'urls', 'titles' and 'snippets'.
  */
 public function testFetchingMainMethod()
 {
     $expectedKeys = array('urls', 'titles', 'snippets');
     $fetcher = Builder::create('Bing');
     $targets = array(
         'http://www.bing.com/search?q=foo',
         'http://www.bing.com/search?q=foo&first=10',
     );
     foreach ($targets as $target) {
         $parsed = $fetcher->fetch($target);
         // Presence of every key is verified before any count is taken.
         foreach ($expectedKeys as $key) {
             $this->assertArrayHasKey($key, $parsed);
         }
         foreach ($expectedKeys as $key) {
             $this->assertCount(10, $parsed[$key]);
         }
     }
 }
Esempio n. 3
0
 /**
  * Fetches two consecutive Yahoo result pages for the query "foo" and checks
  * the shape of each response: three keys, ten items under each.
  */
 public function testFetchingMainMethod()
 {
     $fetcher = Builder::create('Yahoo');
     $fields = array('urls', 'titles', 'snippets');
     $serpUrls = array(
         'https://search.yahoo.com/search?p=foo',
         'https://search.yahoo.com/search?p=foo&b=10',
     );
     foreach ($serpUrls as $serpUrl) {
         $page = $fetcher->fetch($serpUrl);
         // First make sure all fields exist, then check their sizes.
         foreach ($fields as $field) {
             $this->assertArrayHasKey($field, $page);
         }
         foreach ($fields as $field) {
             $this->assertCount(10, $page[$field]);
         }
     }
 }
Esempio n. 4
0
 /**
  * Walks each fetcher (Google, Ask, Bing, Yahoo) through the same cache scenario:
  * a URL is a cache miss before it is fetched and a hit afterwards, disabling
  * caching turns every lookup into a miss, re-enabling restores the hits, and
  * entries stored under one cache directory are not visible from another.
  *
  * The statements are strictly order-dependent: every assertion relies on the
  * cache state left behind by the calls before it.
  *
  * NOTE(review): fetchSerpContent and getSHDWrapper are obtained through
  * TestHelper::getMethod and called with invokeArgs, so they are presumably
  * non-public methods exposed via reflection - confirm against TestHelper.
  * NOTE(review): the fetches appear to issue real HTTP requests and to write
  * cache directories on disk; confirm that the fixture cleans them up.
  * NOTE(review): $SHDObject is assigned but never read; the getSHDWrapper calls
  * look like they are kept only for their side effects - confirm.
  */
 public function testFetchingMethods()
 {
     // --- Google: created without extra arguments ---
     $googleFetcher = Builder::create($this->engines[0]);
     $fetchSerpContent = TestHelper::getMethod('fetchSerpContent', 'Google');
     $getSHDWrapper = TestHelper::getMethod('getSHDWrapper', 'Google');
     // Miss before the fetch, hit after it.
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foo"));
     $fetchedContent = $fetchSerpContent->invokeArgs($googleFetcher, array('http://www.google.com/search?q=foo'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=foo"));
     $SHDObject = $getSHDWrapper->invokeArgs($googleFetcher, array('http://www.google.com/search?q=foo'));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=bar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($googleFetcher, array('http://www.google.com/search?q=bar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=bar"));
     $SHDObject = $getSHDWrapper->invokeArgs($googleFetcher, array('http://www.google.com/search?q=bar'));
     // With caching disabled, previously cached URLs must be reported as misses.
     $googleFetcher->disableCaching();
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foo"));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=bar"));
     // Re-enabling caching makes the earlier entries visible again.
     $this->assertTrue($googleFetcher->enableCaching());
     $this->assertTrue($googleFetcher->enableCachingForever());
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=foo"));
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=bar"));
     // Switching to cache directory 'baz': nothing cached so far is visible there.
     $this->assertTrue($googleFetcher->setCacheDir('baz'));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=baz"));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foobar"));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=bar"));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foo"));
     // Populate 'baz' with two new queries.
     $fetchedContent = $fetchSerpContent->invokeArgs($googleFetcher, array('http://www.google.com/search?q=baz'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=baz"));
     $SHDObject = $getSHDWrapper->invokeArgs($googleFetcher, array('http://www.google.com/search?q=baz'));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foobar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($googleFetcher, array('http://www.google.com/search?q=foobar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=foobar"));
     $SHDObject = $getSHDWrapper->invokeArgs($googleFetcher, array('http://www.google.com/search?q=foobar'));
     // Directory 'cache' holds only the first two queries
     // (presumably it is the default cache directory - TODO confirm).
     $this->assertTrue($googleFetcher->setCacheDir('cache'));
     $this->assertTrue($googleFetcher->cacheHit('http://www.google.com/search?q=foo'));
     $this->assertTrue($googleFetcher->cacheHit('http://www.google.com/search?q=bar'));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=baz"));
     $this->assertFalse($googleFetcher->cacheHit("http://www.google.com/search?q=foobar"));
     // Directory 'baz' holds only the last two queries.
     $this->assertTrue($googleFetcher->setCacheDir('baz'));
     $this->assertFalse($googleFetcher->cacheHit('http://www.google.com/search?q=foo'));
     $this->assertFalse($googleFetcher->cacheHit('http://www.google.com/search?q=bar'));
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=baz"));
     $this->assertTrue($googleFetcher->cacheHit("http://www.google.com/search?q=foobar"));
     // --- Ask: created with cache dir 'bar/foo', TTL 1 and a third argument of false ---
     // (the assertions below expect caching to be off until enableCaching() is called)
     $askFetcher = Builder::create($this->engines[1], array('bar' . DIRECTORY_SEPARATOR . 'foo', 1, false));
     $fetchSerpContent = TestHelper::getMethod('fetchSerpContent', 'Ask');
     $getSHDWrapper = TestHelper::getMethod('getSHDWrapper', 'Ask');
     // Unlike the other engines, the first fetch must NOT produce a cache hit.
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     $fetchedContent = $fetchSerpContent->invokeArgs($askFetcher, array('http://us.ask.com/web?q=foo'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     // After enabling caching, the getSHDWrapper call is expected to leave the URL cached.
     $this->assertTrue($askFetcher->enableCaching());
     $SHDObject = $getSHDWrapper->invokeArgs($askFetcher, array('http://us.ask.com/web?q=foo'));
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=bar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($askFetcher, array('http://us.ask.com/web?q=bar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=bar"));
     $SHDObject = $getSHDWrapper->invokeArgs($askFetcher, array('http://us.ask.com/web?q=bar'));
     // Disabling caching hides the entries; re-enabling brings them back.
     $askFetcher->disableCaching();
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=bar"));
     $this->assertTrue($askFetcher->enableCaching());
     $this->assertTrue($askFetcher->enableCachingForever());
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=bar"));
     // Switching to cache directory 'barfoo': starts out empty.
     $this->assertTrue($askFetcher->setCacheDir('barfoo'));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foo"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=bar"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foobar"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=baz"));
     // Populate 'barfoo' with two new queries.
     $fetchedContent = $fetchSerpContent->invokeArgs($askFetcher, array('http://us.ask.com/web?q=baz'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=baz"));
     $SHDObject = $getSHDWrapper->invokeArgs($askFetcher, array('http://us.ask.com/web?q=baz'));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foobar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($askFetcher, array('http://us.ask.com/web?q=foobar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=foobar"));
     $SHDObject = $getSHDWrapper->invokeArgs($askFetcher, array('http://us.ask.com/web?q=foobar'));
     // Back in the original directory only the first two queries are cached.
     $this->assertTrue($askFetcher->setCacheDir('bar' . DIRECTORY_SEPARATOR . 'foo'));
     $this->assertTrue($askFetcher->cacheHit('http://us.ask.com/web?q=foo'));
     $this->assertTrue($askFetcher->cacheHit('http://us.ask.com/web?q=bar'));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=baz"));
     $this->assertFalse($askFetcher->cacheHit("http://us.ask.com/web?q=foobar"));
     // And 'barfoo' holds only the last two.
     $this->assertTrue($askFetcher->setCacheDir('barfoo'));
     $this->assertFalse($askFetcher->cacheHit('http://us.ask.com/web?q=foo'));
     $this->assertFalse($askFetcher->cacheHit('http://us.ask.com/web?q=bar'));
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=baz"));
     $this->assertTrue($askFetcher->cacheHit("http://us.ask.com/web?q=foobar"));
     // --- Bing: created with cache dir 'foobar' and all five arguments supplied ---
     $bingFetcher = Builder::create($this->engines[2], array('foobar', 48, true, true, 'UTF-16'));
     $fetchSerpContent = TestHelper::getMethod('fetchSerpContent', 'Bing');
     $getSHDWrapper = TestHelper::getMethod('getSHDWrapper', 'Bing');
     // Miss before the fetch, hit after it.
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foo"));
     $fetchedContent = $fetchSerpContent->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=foo'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=foo"));
     $SHDObject = $getSHDWrapper->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=foo'));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=bar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=bar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=bar"));
     $SHDObject = $getSHDWrapper->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=bar'));
     // Disabling caching hides the entries; re-enabling brings them back.
     $bingFetcher->disableCaching();
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foo"));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=bar"));
     $this->assertTrue($bingFetcher->enableCaching());
     $this->assertTrue($bingFetcher->enableCachingForever());
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=foo"));
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=bar"));
     // Switching to cache directory 'barz': starts out empty.
     $this->assertTrue($bingFetcher->setCacheDir('barz'));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=baz"));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foobar"));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=bar"));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foo"));
     // Populate 'barz' with two new queries.
     $fetchedContent = $fetchSerpContent->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=baz'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=baz"));
     $SHDObject = $getSHDWrapper->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=baz'));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foobar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=foobar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=foobar"));
     $SHDObject = $getSHDWrapper->invokeArgs($bingFetcher, array('http://www.bing.com/search?q=foobar'));
     // Back in 'foobar' only the first two queries are cached.
     $this->assertTrue($bingFetcher->setCacheDir('foobar'));
     $this->assertTrue($bingFetcher->cacheHit('http://www.bing.com/search?q=foo'));
     $this->assertTrue($bingFetcher->cacheHit('http://www.bing.com/search?q=bar'));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=baz"));
     $this->assertFalse($bingFetcher->cacheHit("http://www.bing.com/search?q=foobar"));
     // And 'barz' holds only the last two.
     $this->assertTrue($bingFetcher->setCacheDir('barz'));
     $this->assertFalse($bingFetcher->cacheHit('http://www.bing.com/search?q=foo'));
     $this->assertFalse($bingFetcher->cacheHit('http://www.bing.com/search?q=bar'));
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=baz"));
     $this->assertTrue($bingFetcher->cacheHit("http://www.bing.com/search?q=foobar"));
     // --- Yahoo: created with cache dir 'fubar' and all five arguments supplied ---
     // NOTE(review): these URLs use the 'q' parameter while other Yahoo URLs in
     // this source use 'p' - verify this is intentional.
     $yahooFetcher = Builder::create($this->engines[3], array('fubar', 48, true, true, 'UTF-16'));
     $fetchSerpContent = TestHelper::getMethod('fetchSerpContent', 'Yahoo');
     $getSHDWrapper = TestHelper::getMethod('getSHDWrapper', 'Yahoo');
     // Miss before the fetch, hit after it.
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foo"));
     $fetchedContent = $fetchSerpContent->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=foo'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foo"));
     $SHDObject = $getSHDWrapper->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=foo'));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=bar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=bar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=bar"));
     $SHDObject = $getSHDWrapper->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=bar'));
     // Disabling caching hides the entries; re-enabling brings them back.
     $yahooFetcher->disableCaching();
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foo"));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=bar"));
     $this->assertTrue($yahooFetcher->enableCaching());
     $this->assertTrue($yahooFetcher->enableCachingForever());
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foo"));
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=bar"));
     // Switching to cache directory 'fubarz': starts out empty.
     $this->assertTrue($yahooFetcher->setCacheDir('fubarz'));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=baz"));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foobar"));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=bar"));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foo"));
     // Populate 'fubarz' with two new queries.
     $fetchedContent = $fetchSerpContent->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=baz'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=baz"));
     $SHDObject = $getSHDWrapper->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=baz'));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foobar"));
     $fetchedContent = $fetchSerpContent->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=foobar'));
     $this->assertRegExp('/^<!doctype html/i', $fetchedContent);
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foobar"));
     $SHDObject = $getSHDWrapper->invokeArgs($yahooFetcher, array('https://search.yahoo.com/search?q=foobar'));
     // Back in 'fubar' only the first two queries are cached.
     $this->assertTrue($yahooFetcher->setCacheDir('fubar'));
     $this->assertTrue($yahooFetcher->cacheHit('https://search.yahoo.com/search?q=foo'));
     $this->assertTrue($yahooFetcher->cacheHit('https://search.yahoo.com/search?q=bar'));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=baz"));
     $this->assertFalse($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foobar"));
     // And 'fubarz' holds only the last two.
     $this->assertTrue($yahooFetcher->setCacheDir('fubarz'));
     $this->assertFalse($yahooFetcher->cacheHit('https://search.yahoo.com/search?q=foo'));
     $this->assertFalse($yahooFetcher->cacheHit('https://search.yahoo.com/search?q=bar'));
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=baz"));
     $this->assertTrue($yahooFetcher->cacheHit("https://search.yahoo.com/search?q=foobar"));
 }
 /**
  * Checks that the argument array handed to Builder::create() is reflected
  * by the fetcher's getters, and that omitted trailing arguments fall back
  * to the values this test expects (TTL 24, charset 'UTF-8', caching on,
  * caching-forever off).
  *
  * Assertions follow PHPUnit's (expected, actual) argument order so that
  * failure messages label the two values correctly.
  */
 public function testFactoryWithArgs()
 {
     // All five arguments supplied: dir, TTL, caching, caching forever, charset.
     $googleFetcher = Builder::create($this->engines[0], array('baz', 48, true, true, 'UTF-16'));
     $this->assertEquals('baz', $googleFetcher->getCacheDir());
     $this->assertEquals(48, $googleFetcher->getCacheTTL());
     $this->assertEquals('UTF-16', $googleFetcher->getCharset());
     $this->assertTrue($googleFetcher->isCaching());
     $this->assertTrue($googleFetcher->isCachingForever());

     // First three arguments only, with caching explicitly switched off.
     $askCacheDir = 'bar' . DIRECTORY_SEPARATOR . 'foo';
     $askFetcher = Builder::create($this->engines[1], array($askCacheDir, 1, false));
     $this->assertEquals($askCacheDir, $askFetcher->getCacheDir());
     $this->assertEquals(1, $askFetcher->getCacheTTL());
     $this->assertEquals('UTF-8', $askFetcher->getCharset());
     $this->assertFalse($askFetcher->isCaching());
     $this->assertFalse($askFetcher->isCachingForever());

     // Only the cache directory supplied; everything else is left to the defaults.
     $bingFetcher = Builder::create($this->engines[2], array('foo'));
     $this->assertEquals('foo', $bingFetcher->getCacheDir());
     $this->assertEquals(24, $bingFetcher->getCacheTTL());
     $this->assertEquals('UTF-8', $bingFetcher->getCharset());
     $this->assertTrue($bingFetcher->isCaching());
     $this->assertFalse($bingFetcher->isCachingForever());

     $yahooFetcher = Builder::create($this->engines[3], array('foo'));
     $this->assertEquals('foo', $yahooFetcher->getCacheDir());
     $this->assertEquals(24, $yahooFetcher->getCacheTTL());
     $this->assertEquals('UTF-8', $yahooFetcher->getCharset());
     $this->assertTrue($yahooFetcher->isCaching());
     $this->assertFalse($yahooFetcher->isCachingForever());
 }
Esempio n. 6
0
 /**
  * Fetches the first Google result page for the query "foo" and checks that
  * the response exposes ten urls, ten titles and ten snippets.
  */
 public function testFetchingMainMethod()
 {
     $fetcher = Builder::create('Google');
     $serp = $fetcher->fetch('http://www.google.com/search?q=foo');
     $sections = array('urls', 'titles', 'snippets');
     // Every key must exist before any of the sizes is inspected.
     foreach ($sections as $section) {
         $this->assertArrayHasKey($section, $serp);
     }
     foreach ($sections as $section) {
         $this->assertCount(10, $serp[$section]);
     }
 }
Esempio n. 7
0
 /**
  * Create a SerpScraper object.
  *
  * Validates the arguments, stores the configuration, normalizes the
  * keywords, prepares the output and serializer cache directories, starts
  * the throttler and finally builds the fetcher and the serializer.
  *
  * @param array  $keywords           Raw keywords; each one is passed through
  *                                   KeywordValidator::processKeyword().
  * @param string $outDir             Output directory (set up here).
  * @param string $fetcherCacheDir    Cache directory handed to the fetcher.
  * @param string $serializerCacheDir Cache directory for the serializer (set up here).
  * @param int    $cacheTTL           Cache TTL handed to the fetcher.
  * @param int    $requestDelay       Stored on the instance; not used in this constructor.
  */
 public function __construct($keywords, $outDir = self::DEFAULT_OUTPUT_DIR, $fetcherCacheDir = self::DEFAULT_FETCHER_CACHE_DIR, $serializerCacheDir = self::DEFAULT_SERIALIZER_CACHE_DIR, $cacheTTL = self::DEFAULT_FETCHER_CACHE_TTL, $requestDelay = self::DEFAULT_REQUEST_DELAY)
 {
     // perform validation
     SerpScraperHelper::checkArgs($keywords, $outDir, $fetcherCacheDir, $serializerCacheDir, $cacheTTL, $requestDelay);
     // instance variables
     $this->outDir = $outDir;
     $this->fetcherCacheDir = $fetcherCacheDir;
     $this->serializerCacheDir = $serializerCacheDir;
     $this->cacheTTL = $cacheTTL;
     $this->requestDelay = $requestDelay;
     $this->keywords = array();
     $this->fetched = array();
     $this->serialized = array();
     // normalize user input keywords; iterating by value avoids undefined
     // offsets when the array keys are not a gapless 0..n-1 sequence
     foreach ($keywords as $keyword) {
         $this->keywords[] = KeywordValidator::processKeyword($keyword);
     }
     // set up folders
     FileSystemHelper::setUpDir($outDir);
     FileSystemHelper::setUpDir($serializerCacheDir);
     // deps injection (the throttler receives the normalized keywords)
     $this->throttler = new Throttler(self::DEFAULT_THROTTLER_NAME, self::DEFAULT_THROTTLER_THRESHOLD, self::DEFAULT_THROTTLER_METRIC, self::DEFAULT_THROTTLER_METRIC_FACTOR, self::DEFAULT_THROTTLER_COMPONENT_THRESHOLD, $this->keywords);
     // turn on throttling
     $this->throttler->start();
     // instantiate the right fetcher at runtime (will also set up the fetcher cache dir)
     $this->fetcher = SerpFetcherBuilder::create(self::runTimeClassName(), array($this->fetcherCacheDir, $this->cacheTTL));
     $this->serializer = new SerpPageSerializer($serializerCacheDir);
 }