/**
 * End-to-end check of the scrape -> serialize -> save pipeline for the four
 * engines found at $this->engines[0..3] (built as google, ask, bing, yahoo).
 *
 * Google and Bing serialize to JSON first and XML second; Ask and Yahoo do the
 * opposite. After every save() the files written to the scraper's output
 * directory are read back and checked (JSON lint / XMLReader validity).
 */
public function testScrape()
{
    $parser = new \Seld\JsonLint\JsonParser();

    // ---- Google: JSON first, then XML -------------------------------------
    $googleScraper = Builder::create($this->engines[0], array(array('foo', 'baz'), 'google'));
    $outDir        = $googleScraper->getOutDir();
    $this->assertInvalidScrapeCallsRejected($googleScraper);
    // serialize() must return false here: no successful scrape has happened yet.
    $this->assertFalse($googleScraper->serialize('json'));
    $this->assertKeywordQueueScraped($googleScraper, 'Europe/Berlin');

    $this->assertTrue($googleScraper->serialize('json', true));
    $this->assertCount(0, $googleScraper->getFetchedPages());
    $this->assertCount(8, $googleScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($googleScraper);
    $this->assertTrue($googleScraper->save(true));
    $this->assertSavedJsonFilesLint($parser, $outDir, $savedFiles);

    $this->assertTrue($googleScraper->addKeywords(array('foo bad')));
    $this->assertTrue($googleScraper->scrapeAll(3, true));
    $this->assertCount(3, $googleScraper->getFetchedPages());
    $this->assertTrue($googleScraper->serialize('xml', true));
    $this->assertCount(0, $googleScraper->getFetchedPages());
    $this->assertCount(3, $googleScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($googleScraper);
    $this->assertTrue($googleScraper->save(true));
    $this->assertSavedXmlFilesValid($outDir, $savedFiles);

    // ---- Ask: XML first, then JSON ----------------------------------------
    $askScraper = Builder::create($this->engines[1], array(array('foo', 'baz'), 'ask'));
    $outDir     = $askScraper->getOutDir();
    $this->assertInvalidScrapeCallsRejected($askScraper);
    $this->assertKeywordQueueScraped($askScraper, 'Europe/Rome');

    $this->assertTrue($askScraper->serialize('xml', true));
    $this->assertCount(0, $askScraper->getFetchedPages());
    $this->assertCount(8, $askScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($askScraper);
    $this->assertTrue($askScraper->save(true));
    $this->assertCount(0, $askScraper->getSerializedPages());
    $this->assertSavedXmlFilesValid($outDir, $savedFiles);

    $this->assertTrue($askScraper->addKeywords(array('foobaz')));
    $this->assertTrue($askScraper->scrapeAll(3, true));
    $this->assertTrue($askScraper->serialize('json', true));
    $savedFiles = $this->serializedFileNames($askScraper);
    $this->assertTrue($askScraper->save(true));
    $this->assertSavedJsonFilesLint($parser, $outDir, $savedFiles);

    // ---- Bing: JSON first, then XML ---------------------------------------
    $bingScraper = Builder::create($this->engines[2], array(array('foo', 'baz'), 'bing'));
    $outDir      = $bingScraper->getOutDir();
    $this->assertInvalidScrapeCallsRejected($bingScraper);
    // serialize() must return false here: no successful scrape has happened yet.
    $this->assertFalse($bingScraper->serialize('json'));
    $this->assertKeywordQueueScraped($bingScraper, 'Europe/Berlin');

    $this->assertTrue($bingScraper->serialize('json', true));
    $this->assertCount(0, $bingScraper->getFetchedPages());
    $this->assertCount(8, $bingScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($bingScraper);
    $this->assertTrue($bingScraper->save(true));
    $this->assertSavedJsonFilesLint($parser, $outDir, $savedFiles);

    $this->assertTrue($bingScraper->addKeywords(array('foo bad')));
    $this->assertTrue($bingScraper->scrapeAll(2, true));
    $this->assertCount(2, $bingScraper->getFetchedPages());
    $this->assertTrue($bingScraper->serialize('xml', true));
    $this->assertCount(0, $bingScraper->getFetchedPages());
    $this->assertCount(2, $bingScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($bingScraper);
    $this->assertTrue($bingScraper->save(true));
    $this->assertSavedXmlFilesValid($outDir, $savedFiles);

    // ---- Yahoo: XML first, then JSON --------------------------------------
    $yahooScraper = Builder::create($this->engines[3], array(array('foo', 'baz'), 'yahoo'));
    $outDir       = $yahooScraper->getOutDir();
    $this->assertInvalidScrapeCallsRejected($yahooScraper);
    $this->assertKeywordQueueScraped($yahooScraper, 'Europe/Rome');

    $this->assertTrue($yahooScraper->serialize('xml', true));
    $this->assertCount(0, $yahooScraper->getFetchedPages());
    $this->assertCount(8, $yahooScraper->getSerializedPages());
    $savedFiles = $this->serializedFileNames($yahooScraper);
    $this->assertTrue($yahooScraper->save(true));
    $this->assertCount(0, $yahooScraper->getSerializedPages());
    $this->assertSavedXmlFilesValid($outDir, $savedFiles);

    $this->assertTrue($yahooScraper->addKeywords(array('foobaz')));
    $this->assertTrue($yahooScraper->scrapeAll(3, true));
    $this->assertTrue($yahooScraper->serialize('json', true));
    $savedFiles = $this->serializedFileNames($yahooScraper);
    $this->assertTrue($yahooScraper->save(true));
    $this->assertSavedJsonFilesLint($parser, $outDir, $savedFiles);
}

/**
 * Asserts that scrape() returns false for each of the malformed calls that
 * every engine section of testScrape() starts with.
 *
 * @param object $scraper Scraper built with the keywords 'foo' and 'baz'.
 */
private function assertInvalidScrapeCallsRejected($scraper)
{
    // 'bar' is not one of the keywords the scraper was built with.
    $this->assertFalse($scraper->scrape('bar'));
    // NOTE(review): 100 presumably exceeds the allowed page count - confirm.
    $this->assertFalse($scraper->scrape('baz', 100));
    // A string where the valid calls in this test pass a boolean.
    $this->assertFalse($scraper->scrape('baz', 1, 'baz'));
    // 'foobad' where the valid calls in this test pass a timezone identifier.
    $this->assertFalse($scraper->scrape('baz', 1, true, 'foobad'));
    // NOTE(review): meaning of the fifth argument is not visible here - confirm.
    $this->assertFalse($scraper->scrape('baz', 1, true, 'UTC', 'faz'));
}

/**
 * Scrapes the two initial keywords one by one, then two added keywords via
 * scrapeAll(), asserting the fetched-page and keyword-queue counts after
 * every step. Ends by asserting that serialize('baz') is refused.
 *
 * @param object $scraper  Scraper built with the keywords 'foo' and 'baz'.
 * @param string $timezone Timezone identifier passed to the first scrape() call.
 */
private function assertKeywordQueueScraped($scraper, $timezone)
{
    // Each 2-page scrape is expected to add 2 fetched pages and to remove
    // the keyword from the queue.
    $this->assertTrue($scraper->scrape('foo', 2, true, $timezone));
    $this->assertCount(2, $scraper->getFetchedPages());
    $this->assertCount(1, $scraper->getKeywords());
    $this->assertTrue($scraper->scrape('baz', 2, true));
    $this->assertCount(4, $scraper->getFetchedPages());
    $this->assertCount(0, $scraper->getKeywords());

    // The queue is empty at this point, and scrapeAll() must return false.
    $this->assertFalse($scraper->scrapeAll());

    $this->assertTrue($scraper->addKeywords(array('foobaz', 'foobar')));
    $this->assertTrue($scraper->scrapeAll(2, true, 'America/Los_Angeles'));
    $this->assertCount(8, $scraper->getFetchedPages());
    $this->assertCount(0, $scraper->getKeywords());

    // 'baz' is not one of the formats ('json', 'xml') accepted elsewhere in this test.
    $this->assertFalse($scraper->serialize('baz'));
}

/**
 * Maps the keys of the scraper's serialized pages through
 * FileSystemHelper::generateFileName.
 *
 * testScrape() calls this BEFORE save(true): the Ask and Yahoo sections
 * assert that the serialized pages are empty once save(true) has run.
 *
 * @param object $scraper Scraper holding serialized pages.
 *
 * @return array File names, in the order of the serialized pages.
 */
private function serializedFileNames($scraper)
{
    return array_map(
        'Franzip\\SerpScraper\\Helpers\\FileSystemHelper::generateFileName',
        array_keys($scraper->getSerializedPages())
    );
}

/**
 * Reads every given file from $outDir and asserts that it lints as JSON
 * (JsonParser::lint() is asserted to return null).
 *
 * @param \Seld\JsonLint\JsonParser $parser    Linter instance.
 * @param string                    $outDir    Directory the scraper saved into.
 * @param array                     $fileNames File names relative to $outDir.
 */
private function assertSavedJsonFilesLint(\Seld\JsonLint\JsonParser $parser, $outDir, array $fileNames)
{
    foreach ($fileNames as $fileName) {
        $path = $outDir . DIRECTORY_SEPARATOR . $fileName;
        $json = file_get_contents($path);

        // Fail with a readable message instead of linting `false`.
        $this->assertTrue(is_string($json), 'Unable to read saved JSON file: ' . $path);
        $this->assertNull($parser->lint($json), 'Saved JSON file does not lint: ' . $path);
    }
}

/**
 * Opens every given file from $outDir with XMLReader, enables the VALIDATE
 * parser property and asserts that isValid() reports true.
 *
 * @param string $outDir    Directory the scraper saved into.
 * @param array  $fileNames File names relative to $outDir.
 */
private function assertSavedXmlFilesValid($outDir, array $fileNames)
{
    foreach ($fileNames as $fileName) {
        $path   = $outDir . DIRECTORY_SEPARATOR . $fileName;
        $reader = new \XMLReader();

        $this->assertTrue($reader->open($path), 'Unable to open saved XML file: ' . $path);
        $reader->setParserProperty(\XMLReader::VALIDATE, true);

        // Capture the result first so the reader is closed even when the
        // assertion below fails.
        $isValid = $reader->isValid();
        $reader->close();

        $this->assertTrue($isValid, 'Saved XML file is not valid: ' . $path);
    }
}
/**
 * Checks the object returned by Builder::create() for $this->engines[3]:
 * its class hierarchy, its throttler and fetcher collaborators, and that the
 * three default directories named by the scraper's constants exist on disk.
 */
public function testYahooScraper()
{
    $yahooScraper = Builder::create($this->engines[3], array(array('baz')));

    // Expected value first, so PHPUnit labels expected/actual correctly on failure.
    $this->assertSame('Franzip\\SerpScraper\\Scrapers\\SerpScraper', get_parent_class($yahooScraper));
    $this->assertInstanceOf('Franzip\\SerpScraper\\Scrapers\\YahooScraper', $yahooScraper);
    $this->assertInstanceOf('Franzip\\Throttler\\Throttler', $yahooScraper->getThrottler());
    $this->assertInstanceOf('Franzip\\SerpFetcher\\Fetchers\\YahooFetcher', $yahooScraper->getFetcher());

    $defaultDirs = array(
        'output'           => $yahooScraper::DEFAULT_OUTPUT_DIR,
        'fetcher cache'    => $yahooScraper::DEFAULT_FETCHER_CACHE_DIR,
        'serializer cache' => $yahooScraper::DEFAULT_SERIALIZER_CACHE_DIR,
    );

    foreach ($defaultDirs as $label => $dir) {
        // is_dir() is false for missing paths, so no separate file_exists() is needed.
        $this->assertTrue(is_dir($dir), 'Default ' . $label . ' directory is missing: ' . $dir);
    }
}