/**
 * Build a set of regular expressions matching URLs from a list of regex fragments.
 *
 * Fragments are joined with '|' into batches, and every batch is wrapped in
 * the regex start/end strings supplied by the blacklist.
 * Returns an empty list if the input list is empty.
 *
 * @param array $lines list of fragments which will match in URLs
 * @param BaseBlacklist $blacklist provides getRegexStart() and getRegexEnd()
 * @param int $batchSize largest allowed batch regex;
 *                       if 0, will produce one regex per line
 * @return array list of complete regex strings
 */
static function buildRegexes($lines, BaseBlacklist $blacklist, $batchSize = 4096)
{
    $regexes = array();
    $regexStart = $blacklist->getRegexStart();
    $regexEnd = $blacklist->getRegexEnd($batchSize);

    // Fragments accumulated for the current batch; false until the first
    // usable line has been seen.
    $build = false;

    foreach ($lines as $line) {
        if (substr($line, -1, 1) === "\\") {
            // Final \ will break silently on the batched regexes.
            // Skip it here to avoid breaking the next line;
            // warnings from getBadLines() will still trigger on
            // edit to keep new ones from floating in.
            continue;
        }

        // FIXME: not very robust size check, but should work. :)
        // Only the raw fragment lengths are compared; the '|' separator and
        // any characters added by slash escaping are not counted.
        if ($build === false) {
            $build = $line;
        } elseif (strlen($build) + strlen($line) > $batchSize) {
            // Adding this line would overflow the batch: flush what we have
            // and start a new batch with the current line.
            $regexes[] = self::wrapBatchAsRegex($build, $regexStart, $regexEnd);
            $build = $line;
        } else {
            $build .= '|';
            $build .= $line;
        }
    }

    // Flush the final, partially filled batch (if any line was accepted).
    if ($build !== false) {
        $regexes[] = self::wrapBatchAsRegex($build, $regexStart, $regexEnd);
    }

    return $regexes;
}

/**
 * Wrap one batch of '|'-joined fragments in the regex start/end strings.
 *
 * Any run of backslashes directly before a slash is removed, then every
 * slash is escaped exactly once (presumably because '/' is the delimiter
 * used by the regex start/end -- confirm against BaseBlacklist).
 *
 * @param string $batch fragments joined with '|'
 * @param string $regexStart prefix of the finished regex
 * @param string $regexEnd suffix of the finished regex
 * @return string
 */
private static function wrapBatchAsRegex($batch, $regexStart, $regexEnd)
{
    $normalized = preg_replace('|\\\\*/|u', '/', $batch);
    if ($normalized === null) {
        // preg_replace() returns null on a PCRE error, e.g. when the subject
        // is not valid UTF-8 under the 'u' modifier. Passing that null on
        // would collapse the whole batch into an empty group. The pattern is
        // pure ASCII, so a byte-wise replacement gives the same result for
        // valid UTF-8 and keeps the fragments intact otherwise.
        $normalized = preg_replace('|\\\\*/|', '/', $batch);
    }

    return $regexStart . str_replace('/', '\\/', $normalized) . $regexEnd;
}
/**
 * Returns the end of the regex for matches.
 *
 * The result is a closing parenthesis followed by whatever the parent
 * implementation returns for the same batch size.
 *
 * @param int $batchSize forwarded unchanged to the parent implementation
 * @return string
 */
public function getRegexEnd($batchSize)
{
    $parentEnd = parent::getRegexEnd($batchSize);

    return ')' . $parentEnd;
}