/**
* @testdox getRegexp() returns the regexp associated with this attribute preprocessor
*/
public function testGetRegexp()
{
    // Keep the pattern in one place so the input and the expectation cannot drift apart
    $regexp       = '#(?<x>[a-z])#';
    $preprocessor = new AttributePreprocessor($regexp);

    $this->assertSame($regexp, $preprocessor->getRegexp());
}
/**
* Register the given scrape definitions on a tag
*
* Builds the runtime scrape config from the definitions, inserts the scrape filter into
* the tag's filter chain and reports which attributes the "extract" regexps can fill.
*
* @param  Tag   $tag     Tag whose filter chain receives the scrape filter
* @param  array $scrapes Either one scrape definition or a list of definitions. Each
*                        definition has an "extract" entry (one regexp or an array of
*                        regexps) and optional "match" and "url" entries
* @return array          Attributes created from scraped data, keyed by attribute name,
*                        each with a "regexp" element
*/
protected function addScrapes(Tag $tag, array $scrapes)
{
    // A lone definition has no element at index 0, so wrap it to always iterate over a list
    $definitions = (isset($scrapes[0])) ? $scrapes : [$scrapes];

    $attributes = [];
    $config     = [];
    foreach ($definitions as $definition) {
        // Names of the attributes this definition can fill. They are stored in the config
        // so that at runtime the link's content is not scraped if every one of them
        // already has a value
        $names = [];
        foreach ((array) $definition['extract'] as $regexp) {
            // An attribute preprocessor is only used here for its regexp-parsing routines
            $preprocessor = new AttributePreprocessor($regexp);
            foreach ($preprocessor->getAttributes() as $name => $attrRegexp) {
                $attributes[$name]['regexp'] = $attrRegexp;
                $names[] = $name;
            }
        }

        // Drop duplicates, then sort; sort() also reindexes the list from 0
        $names = array_unique($names);
        sort($names);

        // Without a "match" regexp every URL is scraped. The config still needs an entry
        // in that slot, so fall back to a regexp that matches anything
        $match = (isset($definition['match'])) ? $definition['match'] : '//';

        // Config entry layout: [match, extract, attribute names(, url)]
        $entry = [$match, $definition['extract'], $names];
        if (isset($definition['url'])) {
            $entry[] = $definition['url'];
        }

        $config[] = $entry;
    }

    // Insert the scrape filter right before attributes are filtered, which should be after
    // attribute preprocessors have run. The offset is hardcoded for convenience, on the
    // assumption that the filterChain is still in its default state
    $filter = $tag->filterChain->insert(1, __NAMESPACE__ . '\\Parser::scrape');

    // Scraping is impossible in JavaScript without a PHP proxy, so the JS side simply
    // returns true in order to keep the tag valid
    $filter->addParameterByName('scrapeConfig')
           ->addParameterByName('cacheDir')
           ->setVar('scrapeConfig', $config)
           ->setJS('returnTrue');

    return $attributes;
}
/**
* Register the given scrape definitions on a tag
*
* @param  Tag   $tag     Tag whose filter chain receives the scrape filter
* @param  array $scrapes Either one scrape definition or a list of definitions
* @return array          Attribute configs keyed by attribute name, each with a "regexp" element
*/
protected function addScrapes(Tag $tag, array $scrapes)
{
    // A lone definition has no element at index 0, so wrap it to always iterate over a list
    $definitions = (isset($scrapes[0])) ? $scrapes : array($scrapes);

    $attributes = array();
    $config     = array();
    foreach ($definitions as $definition) {
        // Names of every attribute captured by this definition's "extract" regexps
        $names = array();
        foreach ((array) $definition['extract'] as $regexp) {
            $preprocessor = new AttributePreprocessor($regexp);
            foreach ($preprocessor->getAttributes() as $name => $attrRegexp) {
                $attributes[$name]['regexp'] = $attrRegexp;
                $names[] = $name;
            }
        }

        // Drop duplicates, then sort; sort() also reindexes the list from 0
        $names = \array_unique($names);
        \sort($names);

        // When no "match" regexp is given, use an empty pattern, which matches any string
        $match = (isset($definition['match'])) ? $definition['match'] : '//';

        // Config entry layout: [match, extract, attribute names(, url)]
        $entry = array($match, $definition['extract'], $names);
        if (isset($definition['url'])) {
            $entry[] = $definition['url'];
        }

        $config[] = $entry;
    }

    // Insert the scrape filter at offset 1 of the tag's filter chain, then configure it:
    // two by-name parameters, the scrape config as a filter variable, and 'returnTrue'
    // as its JS setting
    $filter = $tag->filterChain->insert(1, __NAMESPACE__ . '\\Parser::scrape');
    $filter->addParameterByName('scrapeConfig')
           ->addParameterByName('cacheDir')
           ->setVar('scrapeConfig', $config)
           ->setJS('returnTrue');

    return $attributes;
}