/**
 * Fetches two Ask result URLs for the query "foo" (the plain query and the
 * same query with page=2) and checks that each response exposes 'urls',
 * 'titles' and 'snippets', every one holding exactly ten entries.
 */
public function testFetchingMainMethod()
{
    $fetcher = Builder::create('Ask');

    $serpUrls = array(
        'http://us.ask.com/web?q=foo',
        'http://us.ask.com/web?q=foo&page=2',
    );
    $sections = array('urls', 'titles', 'snippets');

    foreach ($serpUrls as $serpUrl) {
        $page = $fetcher->fetch($serpUrl);

        // all keys must be present before any of them is counted
        foreach ($sections as $section) {
            $this->assertArrayHasKey($section, $page);
        }
        foreach ($sections as $section) {
            $this->assertCount(10, $page[$section]);
        }
    }
}
/**
 * Fetches two Bing result URLs for the query "foo" (the plain query and the
 * same query with first=10) and checks that each response exposes 'urls',
 * 'titles' and 'snippets', every one holding exactly ten entries.
 */
public function testFetchingMainMethod()
{
    $fetcher = Builder::create('Bing');

    $serpUrls = array(
        'http://www.bing.com/search?q=foo',
        'http://www.bing.com/search?q=foo&first=10',
    );
    $sections = array('urls', 'titles', 'snippets');

    foreach ($serpUrls as $serpUrl) {
        $page = $fetcher->fetch($serpUrl);

        // all keys must be present before any of them is counted
        foreach ($sections as $section) {
            $this->assertArrayHasKey($section, $page);
        }
        foreach ($sections as $section) {
            $this->assertCount(10, $page[$section]);
        }
    }
}
/**
 * Fetches two Yahoo result URLs for the query "foo" (the plain query and the
 * same query with b=10) and checks that each response exposes 'urls',
 * 'titles' and 'snippets', every one holding exactly ten entries.
 */
public function testFetchingMainMethod()
{
    $fetcher = Builder::create('Yahoo');

    $serpUrls = array(
        'https://search.yahoo.com/search?p=foo',
        'https://search.yahoo.com/search?p=foo&b=10',
    );
    $sections = array('urls', 'titles', 'snippets');

    foreach ($serpUrls as $serpUrl) {
        $page = $fetcher->fetch($serpUrl);

        // all keys must be present before any of them is counted
        foreach ($sections as $section) {
            $this->assertArrayHasKey($section, $page);
        }
        foreach ($sections as $section) {
            $this->assertCount(10, $page[$section]);
        }
    }
}
/**
 * Exercises the caching behaviour of four fetchers built from
 * $this->engines[0..3] (used here with the Google, Ask, Bing and Yahoo
 * method handles respectively).
 *
 * For every engine the test obtains handles to 'fetchSerpContent' and
 * 'getSHDWrapper' through TestHelper::getMethod(), invokes them on the
 * fetcher, and checks cacheHit() before and after fetching, after
 * disabling and re-enabling caching, and after switching between two
 * cache directories.
 *
 * The statement order is significant: each cacheHit() expectation depends
 * on the fetches and cache-directory switches that precede it.
 */
public function testFetchingMethods()
{
    $doctypePattern = '/^<!doctype html/i';

    // ------------------------------------------------------------------
    // Google: built without extra constructor arguments
    // ------------------------------------------------------------------
    $google = Builder::create($this->engines[0]);
    $fetchContent = TestHelper::getMethod('fetchSerpContent', 'Google');
    $wrapContent = TestHelper::getMethod('getSHDWrapper', 'Google');

    $foo = 'http://www.google.com/search?q=foo';
    $bar = 'http://www.google.com/search?q=bar';
    $baz = 'http://www.google.com/search?q=baz';
    $foobar = 'http://www.google.com/search?q=foobar';

    // a miss turns into a hit once the page has been fetched
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($google->cacheHit($url));
        $html = $fetchContent->invokeArgs($google, array($url));
        $this->assertRegExp($doctypePattern, $html);
        $this->assertTrue($google->cacheHit($url));
        // the wrapper is only invoked, its result is never asserted on
        $shd = $wrapContent->invokeArgs($google, array($url));
    }

    // no hits are reported while caching is disabled
    $google->disableCaching();
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($google->cacheHit($url));
    }

    // the earlier entries are hits again after re-enabling
    $this->assertTrue($google->enableCaching());
    $this->assertTrue($google->enableCachingForever());
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($google->cacheHit($url));
    }

    // a different cache directory knows none of the four queries
    $this->assertTrue($google->setCacheDir('baz'));
    foreach (array($baz, $foobar, $bar, $foo) as $url) {
        $this->assertFalse($google->cacheHit($url));
    }

    $html = $fetchContent->invokeArgs($google, array($baz));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($google->cacheHit($baz));
    $shd = $wrapContent->invokeArgs($google, array($baz));

    $this->assertFalse($google->cacheHit($foobar));
    $html = $fetchContent->invokeArgs($google, array($foobar));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($google->cacheHit($foobar));
    $shd = $wrapContent->invokeArgs($google, array($foobar));

    // each directory only reports the queries fetched while it was active
    $this->assertTrue($google->setCacheDir('cache'));
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($google->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertFalse($google->cacheHit($url));
    }

    $this->assertTrue($google->setCacheDir('baz'));
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($google->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertTrue($google->cacheHit($url));
    }

    // ------------------------------------------------------------------
    // Ask: built with a third constructor argument of false; the test
    // expects the first fetch NOT to produce a cache hit
    // ------------------------------------------------------------------
    $ask = Builder::create($this->engines[1], array('bar' . DIRECTORY_SEPARATOR . 'foo', 1, false));
    $fetchContent = TestHelper::getMethod('fetchSerpContent', 'Ask');
    $wrapContent = TestHelper::getMethod('getSHDWrapper', 'Ask');

    $foo = 'http://us.ask.com/web?q=foo';
    $bar = 'http://us.ask.com/web?q=bar';
    $baz = 'http://us.ask.com/web?q=baz';
    $foobar = 'http://us.ask.com/web?q=foobar';

    // still a miss after fetching; a hit only after enableCaching() + wrapper call
    $this->assertFalse($ask->cacheHit($foo));
    $html = $fetchContent->invokeArgs($ask, array($foo));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertFalse($ask->cacheHit($foo));
    $this->assertTrue($ask->enableCaching());
    $shd = $wrapContent->invokeArgs($ask, array($foo));
    $this->assertTrue($ask->cacheHit($foo));

    $this->assertFalse($ask->cacheHit($bar));
    $html = $fetchContent->invokeArgs($ask, array($bar));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($ask->cacheHit($bar));
    $shd = $wrapContent->invokeArgs($ask, array($bar));

    $ask->disableCaching();
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($ask->cacheHit($url));
    }

    $this->assertTrue($ask->enableCaching());
    $this->assertTrue($ask->enableCachingForever());
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($ask->cacheHit($url));
    }

    $this->assertTrue($ask->setCacheDir('barfoo'));
    foreach (array($foo, $bar, $foobar, $baz) as $url) {
        $this->assertFalse($ask->cacheHit($url));
    }

    $html = $fetchContent->invokeArgs($ask, array($baz));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($ask->cacheHit($baz));
    $shd = $wrapContent->invokeArgs($ask, array($baz));

    $this->assertFalse($ask->cacheHit($foobar));
    $html = $fetchContent->invokeArgs($ask, array($foobar));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($ask->cacheHit($foobar));
    $shd = $wrapContent->invokeArgs($ask, array($foobar));

    $this->assertTrue($ask->setCacheDir('bar' . DIRECTORY_SEPARATOR . 'foo'));
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($ask->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertFalse($ask->cacheHit($url));
    }

    $this->assertTrue($ask->setCacheDir('barfoo'));
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($ask->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertTrue($ask->cacheHit($url));
    }

    // ------------------------------------------------------------------
    // Bing: built with five explicit constructor arguments
    // ------------------------------------------------------------------
    $bing = Builder::create($this->engines[2], array('foobar', 48, true, true, 'UTF-16'));
    $fetchContent = TestHelper::getMethod('fetchSerpContent', 'Bing');
    $wrapContent = TestHelper::getMethod('getSHDWrapper', 'Bing');

    $foo = 'http://www.bing.com/search?q=foo';
    $bar = 'http://www.bing.com/search?q=bar';
    $baz = 'http://www.bing.com/search?q=baz';
    $foobar = 'http://www.bing.com/search?q=foobar';

    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($bing->cacheHit($url));
        $html = $fetchContent->invokeArgs($bing, array($url));
        $this->assertRegExp($doctypePattern, $html);
        $this->assertTrue($bing->cacheHit($url));
        $shd = $wrapContent->invokeArgs($bing, array($url));
    }

    $bing->disableCaching();
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($bing->cacheHit($url));
    }

    $this->assertTrue($bing->enableCaching());
    $this->assertTrue($bing->enableCachingForever());
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($bing->cacheHit($url));
    }

    $this->assertTrue($bing->setCacheDir('barz'));
    foreach (array($baz, $foobar, $bar, $foo) as $url) {
        $this->assertFalse($bing->cacheHit($url));
    }

    $html = $fetchContent->invokeArgs($bing, array($baz));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($bing->cacheHit($baz));
    $shd = $wrapContent->invokeArgs($bing, array($baz));

    $this->assertFalse($bing->cacheHit($foobar));
    $html = $fetchContent->invokeArgs($bing, array($foobar));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($bing->cacheHit($foobar));
    $shd = $wrapContent->invokeArgs($bing, array($foobar));

    $this->assertTrue($bing->setCacheDir('foobar'));
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($bing->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertFalse($bing->cacheHit($url));
    }

    $this->assertTrue($bing->setCacheDir('barz'));
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($bing->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertTrue($bing->cacheHit($url));
    }

    // ------------------------------------------------------------------
    // Yahoo: same scenario as Bing with its own cache directories
    // ------------------------------------------------------------------
    $yahoo = Builder::create($this->engines[3], array('fubar', 48, true, true, 'UTF-16'));
    $fetchContent = TestHelper::getMethod('fetchSerpContent', 'Yahoo');
    $wrapContent = TestHelper::getMethod('getSHDWrapper', 'Yahoo');

    $foo = 'https://search.yahoo.com/search?q=foo';
    $bar = 'https://search.yahoo.com/search?q=bar';
    $baz = 'https://search.yahoo.com/search?q=baz';
    $foobar = 'https://search.yahoo.com/search?q=foobar';

    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($yahoo->cacheHit($url));
        $html = $fetchContent->invokeArgs($yahoo, array($url));
        $this->assertRegExp($doctypePattern, $html);
        $this->assertTrue($yahoo->cacheHit($url));
        $shd = $wrapContent->invokeArgs($yahoo, array($url));
    }

    $yahoo->disableCaching();
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($yahoo->cacheHit($url));
    }

    $this->assertTrue($yahoo->enableCaching());
    $this->assertTrue($yahoo->enableCachingForever());
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($yahoo->cacheHit($url));
    }

    $this->assertTrue($yahoo->setCacheDir('fubarz'));
    foreach (array($baz, $foobar, $bar, $foo) as $url) {
        $this->assertFalse($yahoo->cacheHit($url));
    }

    $html = $fetchContent->invokeArgs($yahoo, array($baz));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($yahoo->cacheHit($baz));
    $shd = $wrapContent->invokeArgs($yahoo, array($baz));

    $this->assertFalse($yahoo->cacheHit($foobar));
    $html = $fetchContent->invokeArgs($yahoo, array($foobar));
    $this->assertRegExp($doctypePattern, $html);
    $this->assertTrue($yahoo->cacheHit($foobar));
    $shd = $wrapContent->invokeArgs($yahoo, array($foobar));

    $this->assertTrue($yahoo->setCacheDir('fubar'));
    foreach (array($foo, $bar) as $url) {
        $this->assertTrue($yahoo->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertFalse($yahoo->cacheHit($url));
    }

    $this->assertTrue($yahoo->setCacheDir('fubarz'));
    foreach (array($foo, $bar) as $url) {
        $this->assertFalse($yahoo->cacheHit($url));
    }
    foreach (array($baz, $foobar) as $url) {
        $this->assertTrue($yahoo->cacheHit($url));
    }
}
/**
 * Checks that the argument array handed to Builder::create() is reflected
 * by the fetcher's getters, and that the values asserted below (TTL 24,
 * charset 'UTF-8', caching on, caching-forever off) are what the getters
 * report when only the cache directory is supplied.
 *
 * Expected values are passed as the first argument of assertEquals(), as
 * PHPUnit's signature requires, so failure messages report "expected" and
 * "actual" the right way round.
 */
public function testFactoryWithArgs()
{
    // all five arguments supplied
    $googleFetcher = Builder::create($this->engines[0], array('baz', 48, true, true, 'UTF-16'));
    $this->assertEquals('baz', $googleFetcher->getCacheDir());
    $this->assertEquals(48, $googleFetcher->getCacheTTL());
    $this->assertEquals('UTF-16', $googleFetcher->getCharset());
    $this->assertTrue($googleFetcher->isCaching());
    $this->assertTrue($googleFetcher->isCachingForever());

    // three arguments supplied, with false in third position
    $askFetcher = Builder::create($this->engines[1], array('bar' . DIRECTORY_SEPARATOR . 'foo', 1, false));
    $this->assertEquals('bar' . DIRECTORY_SEPARATOR . 'foo', $askFetcher->getCacheDir());
    $this->assertEquals(1, $askFetcher->getCacheTTL());
    $this->assertEquals('UTF-8', $askFetcher->getCharset());
    $this->assertFalse($askFetcher->isCaching());
    $this->assertFalse($askFetcher->isCachingForever());

    // only the cache directory supplied
    $bingFetcher = Builder::create($this->engines[2], array('foo'));
    $this->assertEquals('foo', $bingFetcher->getCacheDir());
    $this->assertEquals(24, $bingFetcher->getCacheTTL());
    $this->assertEquals('UTF-8', $bingFetcher->getCharset());
    $this->assertTrue($bingFetcher->isCaching());
    $this->assertFalse($bingFetcher->isCachingForever());

    // only the cache directory supplied
    $yahooFetcher = Builder::create($this->engines[3], array('foo'));
    $this->assertEquals('foo', $yahooFetcher->getCacheDir());
    $this->assertEquals(24, $yahooFetcher->getCacheTTL());
    $this->assertEquals('UTF-8', $yahooFetcher->getCharset());
    $this->assertTrue($yahooFetcher->isCaching());
    $this->assertFalse($yahooFetcher->isCachingForever());
}
/**
 * Fetches the Google result URL for the query "foo" and checks that the
 * response exposes 'urls', 'titles' and 'snippets', every one holding
 * exactly ten entries.
 */
public function testFetchingMainMethod()
{
    $fetcher = Builder::create('Google');
    $page = $fetcher->fetch('http://www.google.com/search?q=foo');

    $sections = array('urls', 'titles', 'snippets');

    // all keys must be present before any of them is counted
    foreach ($sections as $section) {
        $this->assertArrayHasKey($section, $page);
    }
    foreach ($sections as $section) {
        $this->assertCount(10, $page[$section]);
    }
}
/**
 * Create a SerpScraper object.
 *
 * Validates the arguments, stores them on the instance, normalizes every
 * keyword, sets up the output and serializer cache directories, starts
 * the throttler and instantiates the fetcher and the serializer.
 *
 * @param array  $keywords           keywords to process; each value is passed
 *                                   through KeywordValidator::processKeyword()
 * @param string $outDir             output directory (set up by this constructor)
 * @param string $fetcherCacheDir    cache directory handed to the fetcher
 * @param string $serializerCacheDir cache directory handed to the serializer
 *                                   (set up by this constructor)
 * @param int    $cacheTTL           cache TTL handed to the fetcher
 * @param int    $requestDelay       stored on the instance; not used further
 *                                   inside this constructor
 */
public function __construct(
    $keywords,
    $outDir = self::DEFAULT_OUTPUT_DIR,
    $fetcherCacheDir = self::DEFAULT_FETCHER_CACHE_DIR,
    $serializerCacheDir = self::DEFAULT_SERIALIZER_CACHE_DIR,
    $cacheTTL = self::DEFAULT_FETCHER_CACHE_TTL,
    $requestDelay = self::DEFAULT_REQUEST_DELAY
) {
    // perform validation
    SerpScraperHelper::checkArgs($keywords, $outDir, $fetcherCacheDir, $serializerCacheDir, $cacheTTL, $requestDelay);

    // instance variables
    $this->outDir = $outDir;
    $this->fetcherCacheDir = $fetcherCacheDir;
    $this->serializerCacheDir = $serializerCacheDir;
    $this->cacheTTL = $cacheTTL;
    $this->requestDelay = $requestDelay;
    $this->keywords = array();
    $this->fetched = array();
    $this->serialized = array();

    // normalize user input keywords; foreach visits every value whatever
    // the array's keys are, so string-keyed or sparse arrays work too
    foreach ($keywords as $keyword) {
        $this->keywords[] = KeywordValidator::processKeyword($keyword);
    }

    // set up folders
    FileSystemHelper::setUpDir($outDir);
    FileSystemHelper::setUpDir($serializerCacheDir);

    // deps injection
    $this->throttler = new Throttler(
        self::DEFAULT_THROTTLER_NAME,
        self::DEFAULT_THROTTLER_THRESHOLD,
        self::DEFAULT_THROTTLER_METRIC,
        self::DEFAULT_THROTTLER_METRIC_FACTOR,
        self::DEFAULT_THROTTLER_COMPONENT_THRESHOLD,
        $this->keywords
    );

    // turn on throttling
    $this->throttler->start();

    // instantiate the right fetcher at runtime (will also set up the fetcher cache dir)
    $this->fetcher = SerpFetcherBuilder::create(
        self::runTimeClassName(),
        array($this->fetcherCacheDir, $this->cacheTTL)
    );
    $this->serializer = new SerpPageSerializer($serializerCacheDir);
}