Ejemplo n.º 1
0
 /**
  * Build a set of regular expressions matching URLs from a list of regex fragments.
  *
  * Fragments are joined with "|" into batches, and every batch is wrapped between
  * $blacklist->getRegexStart() and $blacklist->getRegexEnd($batchSize).
  * Returns an empty list if the input list is empty or every line is skipped.
  *
  * @param array $lines list of fragments which will match in URLs
  * @param BaseBlacklist $blacklist supplies the regex prefix and suffix
  * @param int $batchSize largest allowed batch, in bytes of joined fragments
  *                       (the "|" separators are counted);
  *                       if 0, will produce one regex per line.
  *                       A single fragment longer than this still gets a regex of its own.
  * @return array list of regex strings, one per batch
  */
 public static function buildRegexes($lines, BaseBlacklist $blacklist, $batchSize = 4096)
 {
     $regexStart = $blacklist->getRegexStart();
     $regexEnd = $blacklist->getRegexEnd($batchSize);

     // Pass 1: group the fragments into "|"-joined batches.
     $batches = array();
     $build = false; // false means "no batch started yet"
     foreach ($lines as $line) {
         if (substr($line, -1) === "\\") {
             // Final \ will break silently on the batched regexes.
             // Skip it here to avoid breaking the next line;
             // warnings from getBadLines() will still trigger on
             // edit to keep new ones from floating in.
             continue;
         }
         if ($build === false) {
             $build = $line;
         } elseif (strlen($build) + 1 + strlen($line) > $batchSize) {
             // The "+ 1" accounts for the "|" separator that joining would add,
             // so a batch never grows past $batchSize by appending.
             $batches[] = $build;
             $build = $line;
         } else {
             $build .= '|' . $line;
         }
     }
     if ($build !== false) {
         $batches[] = $build;
     }

     // Pass 2: normalise slashes and wrap each batch into a full regex.
     $regexes = array();
     foreach ($batches as $batch) {
         // Collapse any run of backslashes in front of a "/" so that the
         // str_replace() below escapes every slash exactly once.
         $normalized = preg_replace('|\\\\*/|u', '/', $batch);
         if ($normalized === null) {
             // preg_replace() returns null on error (e.g. the batch is not valid
             // UTF-8 under the "u" modifier). Retry byte-wise rather than silently
             // emitting an empty alternation; the pattern is pure ASCII, so the
             // result is identical for valid input.
             $normalized = preg_replace('|\\\\*/|', '/', $batch);
         }
         $regexes[] = $regexStart . str_replace('/', '\\/', $normalized) . $regexEnd;
     }
     return $regexes;
 }
Ejemplo n.º 2
0
 /**
  * Returns the closing portion of the regex used for matches.
  *
  * A ")" is prepended to whatever the parent implementation returns
  * for the same batch size.
  *
  * @param $batchSize value passed through unchanged to parent::getRegexEnd()
  * @return string
  */
 public function getRegexEnd($batchSize)
 {
     $parentEnd = parent::getRegexEnd($batchSize);

     return ')' . $parentEnd;
 }