Ejemplo n.º 1
0
    /**
     * Ensure a recursive entity expansion payload is rejected.
     *
     * The document below declares a chain of eight entities, each one
     * expanding to many references to the next. XmlTypeCheck is expected
     * to report the document as not well-formed.
     *
     * (If the DOS isn't properly handled, the test runner will probably go OOM...)
     */
    public function testRecursiveEntity()
    {
        $payload = <<<'XML'
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
	<!ENTITY test "&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;">
	<!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
	<!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
	<!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
	<!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
	<!ENTITY e "&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;">
	<!ENTITY f "&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;">
	<!ENTITY g "-00000000000000000000000000000000000000000000000000000000000000000000000-">
]>
<foo>
<bar>&test;</bar>
</foo>
XML;
        // Parse the payload and assert on the resulting flag in one step.
        $this->assertFalse(XmlTypeCheck::newFromString($payload)->wellFormed);
    }
Ejemplo n.º 2
0
 /**
  * Guess the mime type from the file contents.
  *
  * Reads the first 1024 bytes ("head") and up to the last 65558 bytes
  * ("tail") of the file, then runs a fixed sequence of checks: hard-coded
  * magic numbers, EBML (Matroska/WebM), WebP, PHP open tags, well-formed
  * XML, shebang scripts, ZIP, getimagesize() and finally DjVu. The first
  * check that matches decides the result.
  *
  * All binary signatures are written as hex escapes so that they do not
  * depend on the encoding this source file happens to be saved in.
  *
  * @todo Remove $ext param
  *
  * @param string $file Path of the file to inspect
  * @param mixed $ext Forwarded unchanged to detectZipType()
  * @return bool|string The guessed mime type; 'unknown/unknown' if the file
  *  cannot be opened or is an unrecognised EBML container; false if no
  *  check matched
  */
 private function doGuessMimeType($file, $ext)
 {
     // Open in binary mode: the checks below compare raw bytes, and text
     // mode ('t') rewrites line endings on platforms that distinguish it.
     wfSuppressWarnings();
     $f = fopen($file, 'rb');
     wfRestoreWarnings();
     if (!$f) {
         return 'unknown/unknown';
     }
     $head = fread($f, 1024);
     // 65558 = maximum size of a zip EOCDR. Clamp to the real file size:
     // seeking further back than the start of the file fails and would
     // leave the read position wherever the head read stopped.
     $stat = fstat($f);
     $tailLength = $stat ? min(65558, $stat['size']) : 0;
     $tail = '';
     if ($tailLength > 0 && fseek($f, -$tailLength, SEEK_END) === 0) {
         $tail = fread($f, $tailLength);
     }
     fclose($f);
     wfDebug(__METHOD__ . ": analyzing head and tail of {$file} for magic numbers.\n");
     // Hardcode a few magic number checks...
     $headers = array(
         // Multimedia
         'MThd' => 'audio/midi',
         'OggS' => 'application/ogg',
         // Windows Metafile: bare header, then the placeable-WMF magic
         "\x01\x00\x09\x00" => 'application/x-msmetafile',
         "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
         '%PDF' => 'application/pdf',
         'gimp xcf' => 'image/x-xcf',
         // Executables: DOS/Windows, Mach-O fat binary, ELF
         'MZ' => 'application/octet-stream',
         "\xca\xfe\xba\xbe" => 'application/octet-stream',
         "\x7fELF" => 'application/octet-stream',
     );
     foreach ($headers as $magic => $candidate) {
         if (strncmp($head, $magic, strlen($magic)) === 0) {
             wfDebug(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
             return $candidate;
         }
     }
     /* Look for WebM and Matroska files (EBML magic 1A 45 DF A3) */
     if (strncmp($head, "\x1a\x45\xdf\xa3", 4) === 0) {
         // 0x4282 is the EBML DocType element ID
         $doctype = strpos($head, "\x42\x82");
         if ($doctype !== false) {
             // Next byte is datasize, then data (sizes larger than 1 byte are very stupid muxers)
             $data = substr($head, $doctype + 3, 8);
             if (strncmp($data, "matroska", 8) === 0) {
                 wfDebug(__METHOD__ . ": recognized file as video/x-matroska\n");
                 return "video/x-matroska";
             } elseif (strncmp($data, "webm", 4) === 0) {
                 wfDebug(__METHOD__ . ": recognized file as video/webm\n");
                 return "video/webm";
             }
         }
         wfDebug(__METHOD__ . ": unknown EBML file\n");
         return "unknown/unknown";
     }
     /* Look for WebP. Only the first 7 bytes of the chunk id are compared so
      * that "VP8 " (lossy), "VP8L" (lossless) and "VP8X" (extended) all match. */
     if (strncmp($head, "RIFF", 4) === 0 && strncmp(substr($head, 8, 7), "WEBPVP8", 7) === 0) {
         wfDebug(__METHOD__ . ": recognized file as image/webp\n");
         return "image/webp";
     }
     /**
      * Look for PHP.  Check for this before HTML/XML...  Warning: this is a
      * heuristic, and won't match a file with a lot of non-PHP before.  It
      * will also match text files which could be PHP. :)
      *
      * @todo FIXME: For this reason, the check is probably useless -- an attacker
      * could almost certainly just pad the file with a lot of nonsense to
      * circumvent the check in any case where it would be a security
      * problem.  On the other hand, it causes harmful false positives (bug
      * 16583).  The heuristic has been cut down to exclude three-character
      * strings like "<? ", but should it be axed completely?
      *
      * Apart from the plain '<?php', every needle below is the UTF-16LE
      * form of an open tag (each ASCII character followed by a NUL byte),
      * so the bare ASCII "<? " is deliberately not matched.
      */
     if (strpos($head, '<?php') !== false
         || strpos($head, "<\x00?\x00p\x00h\x00p") !== false
         || strpos($head, "<\x00?\x00 ") !== false
         || strpos($head, "<\x00?\x00\n") !== false
         || strpos($head, "<\x00?\x00\t") !== false
         || strpos($head, "<\x00?\x00=") !== false
     ) {
         wfDebug(__METHOD__ . ": recognized {$file} as application/x-php\n");
         return 'application/x-php';
     }
     /**
      * look for XML formats (XHTML and SVG)
      */
     $xml = new XmlTypeCheck($file);
     if ($xml->wellFormed) {
         global $wgXMLMimeTypes;
         $rootElement = $xml->getRootElement();
         if (isset($wgXMLMimeTypes[$rootElement])) {
             return $wgXMLMimeTypes[$rootElement];
         }
         return 'application/xml';
     }
     /**
      * look for shell scripts
      */
     $script_type = null;
     # detect by shebang, optionally preceded by a byte order mark;
     # each substr() length equals the byte length of the literal it is compared to
     if (substr($head, 0, 2) === "#!") {
         $script_type = "ASCII";
     } elseif (substr($head, 0, 5) === "\xef\xbb\xbf#!") {
         $script_type = "UTF-8";
     } elseif (substr($head, 0, 6) === "\xfe\xff\x00#\x00!") {
         $script_type = "UTF-16BE";
     } elseif (substr($head, 0, 6) === "\xff\xfe#\x00!\x00") {
         $script_type = "UTF-16LE";
     }
     if ($script_type) {
         if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
             // Quick and dirty fold down to ASCII!
             $pack = array('UTF-16BE' => 'n*', 'UTF-16LE' => 'v*');
             // substr( $head, 2 ) skips the two-byte BOM
             $chars = unpack($pack[$script_type], substr($head, 2));
             $head = '';
             foreach ($chars as $codepoint) {
                 $head .= $codepoint < 128 ? chr($codepoint) : '?';
             }
         }
         $match = array();
         // The second capture group is the interpreter's base name
         if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
             $mime = "application/x-{$match[2]}";
             wfDebug(__METHOD__ . ": shell script recognized as {$mime}\n");
             return $mime;
         }
     }
     // Check for ZIP variants (before getimagesize): look for the
     // end-of-central-directory signature rather than any "PK" byte pair.
     if (strpos($tail, "PK\x05\x06") !== false) {
         wfDebug(__METHOD__ . ": ZIP header present in {$file}\n");
         return $this->detectZipType($head, $tail, $ext);
     }
     wfSuppressWarnings();
     $gis = getimagesize($file);
     wfRestoreWarnings();
     if ($gis && isset($gis['mime'])) {
         $mime = $gis['mime'];
         wfDebug(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
         return $mime;
     }
     // Also test DjVu
     $deja = new DjVuImage($file);
     if ($deja->isValid()) {
         wfDebug(__METHOD__ . ": detected {$file} as image/vnd.djvu\n");
         return 'image/vnd.djvu';
     }
     return false;
 }
Ejemplo n.º 3
0
 /**
  * Malformed input passed to newFromString() must be flagged as not well-formed.
  *
  * @covers XMLTypeCheck::newFromString
  */
 public function testMalFormedXML()
 {
     $this->assertFalse(XmlTypeCheck::newFromString(self::MAL_FORMED_XML)->wellFormed);
 }
Ejemplo n.º 4
0
 /**
  * Guess the mime type from the file contents.
  *
  * Reads the first 1024 bytes ("head") and up to the last 65558 bytes
  * ("tail") of the file, then runs a fixed sequence of checks: hard-coded
  * magic numbers, PHP open tags, well-formed XML, shebang scripts, ZIP,
  * getimagesize() and finally DjVu. The first check that matches decides
  * the result.
  *
  * All binary signatures are written as hex escapes so that they do not
  * depend on the encoding this source file happens to be saved in.
  *
  * @param string $file Path of the file to inspect
  * @param mixed $ext Not used by this method
  * @return bool|string The guessed mime type; "unknown/unknown" if the file
  *  cannot be opened; false if no check matched
  */
 function doGuessMimeType($file, $ext = true)
 {
     // Open in binary mode: the checks below compare raw bytes, and text
     // mode ('t') rewrites line endings on platforms that distinguish it.
     wfSuppressWarnings();
     $f = fopen($file, "rb");
     wfRestoreWarnings();
     if (!$f) {
         return "unknown/unknown";
     }
     $head = fread($f, 1024);
     // 65558 = maximum size of a zip EOCDR. Clamp to the real file size:
     // seeking further back than the start of the file fails and would
     // leave the read position wherever the head read stopped.
     $stat = fstat($f);
     $tailLength = $stat ? min(65558, $stat['size']) : 0;
     $tail = '';
     if ($tailLength > 0 && fseek($f, -$tailLength, SEEK_END) === 0) {
         $tail = fread($f, $tailLength);
     }
     fclose($f);
     // Hardcode a few magic number checks...
     $headers = array(
         // Multimedia
         'MThd' => 'audio/midi',
         'OggS' => 'application/ogg',
         // Windows Metafile: bare header, then the placeable-WMF magic
         "\x01\x00\x09\x00" => 'application/x-msmetafile',
         "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
         '%PDF' => 'application/pdf',
         'gimp xcf' => 'image/x-xcf',
         // Executables: DOS/Windows, Mach-O fat binary, ELF
         'MZ' => 'application/octet-stream',
         "\xca\xfe\xba\xbe" => 'application/octet-stream',
         "\x7fELF" => 'application/octet-stream',
     );
     foreach ($headers as $magic => $candidate) {
         if (strncmp($head, $magic, strlen($magic)) === 0) {
             wfDebug(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
             return $candidate;
         }
     }
     /*
      * look for PHP
      * Check for this before HTML/XML...
      * Warning: this is a heuristic, and won't match a file with a lot of non-PHP before.
      * It will also match text files which could be PHP. :)
      *
      * The first five needles are the ASCII open tags; the last five are the
      * same tags in UTF-16LE (each ASCII character followed by a NUL byte).
      */
     if (strpos($head, '<?php') !== false
         || strpos($head, '<? ') !== false
         || strpos($head, "<?\n") !== false
         || strpos($head, "<?\t") !== false
         || strpos($head, "<?=") !== false
         || strpos($head, "<\x00?\x00p\x00h\x00p") !== false
         || strpos($head, "<\x00?\x00 ") !== false
         || strpos($head, "<\x00?\x00\n") !== false
         || strpos($head, "<\x00?\x00\t") !== false
         || strpos($head, "<\x00?\x00=") !== false
     ) {
         wfDebug(__METHOD__ . ": recognized {$file} as application/x-php\n");
         return "application/x-php";
     }
     /*
      * look for XML formats (XHTML and SVG)
      */
     $xml = new XmlTypeCheck($file);
     if ($xml->wellFormed) {
         global $wgXMLMimeTypes;
         $rootElement = $xml->getRootElement();
         if (isset($wgXMLMimeTypes[$rootElement])) {
             return $wgXMLMimeTypes[$rootElement];
         }
         return 'application/xml';
     }
     /*
      * look for shell scripts
      */
     $script_type = null;
     # detect by shebang, optionally preceded by a byte order mark;
     # each substr() length equals the byte length of the literal it is compared to
     if (substr($head, 0, 2) === "#!") {
         $script_type = "ASCII";
     } elseif (substr($head, 0, 5) === "\xef\xbb\xbf#!") {
         $script_type = "UTF-8";
     } elseif (substr($head, 0, 6) === "\xfe\xff\x00#\x00!") {
         $script_type = "UTF-16BE";
     } elseif (substr($head, 0, 6) === "\xff\xfe#\x00!\x00") {
         $script_type = "UTF-16LE";
     }
     if ($script_type) {
         if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
             // Quick and dirty fold down to ASCII!
             $pack = array('UTF-16BE' => 'n*', 'UTF-16LE' => 'v*');
             // substr( $head, 2 ) skips the two-byte BOM
             $chars = unpack($pack[$script_type], substr($head, 2));
             $head = '';
             foreach ($chars as $codepoint) {
                 $head .= $codepoint < 128 ? chr($codepoint) : '?';
             }
         }
         $match = array();
         // The second capture group is the interpreter's base name
         if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
             $mime = "application/x-{$match[2]}";
             wfDebug(__METHOD__ . ": shell script recognized as {$mime}\n");
             return $mime;
         }
     }
     // Check for ZIP (before getimagesize): look for the
     // end-of-central-directory signature rather than any "PK" byte pair.
     if (strpos($tail, "PK\x05\x06") !== false) {
         wfDebug(__METHOD__ . ": ZIP header present at end of {$file}\n");
         return $this->detectZipType($head);
     }
     wfSuppressWarnings();
     $gis = getimagesize($file);
     wfRestoreWarnings();
     if ($gis && isset($gis['mime'])) {
         $mime = $gis['mime'];
         wfDebug(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
         return $mime;
     }
     // Also test DjVu
     $deja = new DjVuImage($file);
     if ($deja->isValid()) {
         wfDebug(__METHOD__ . ": detected {$file} as image/vnd.djvu\n");
         return 'image/vnd.djvu';
     }
     return false;
 }
Ejemplo n.º 5
0
 /**
  * Guess the MIME type from the file contents.
  *
  * Reads the first 1024 bytes ("head") and up to the last 65558 bytes
  * ("tail") of the file, then runs a fixed sequence of checks: hard-coded
  * magic numbers, EBML (Matroska/WebM), WebP, PHP open tags, well-formed
  * XML, shebang scripts, ZIP and getimagesize(). If none of them match,
  * the optional guess callback gets a chance to set the result.
  *
  * All binary signatures are written as hex escapes so that they do not
  * depend on the encoding this source file happens to be saved in.
  *
  * @todo Remove $ext param
  *
  * @param string $file Path of the file to inspect
  * @param mixed $ext Forwarded unchanged to detectZipType()
  * @return bool|string The guessed MIME type; 'unknown/unknown' if the file
  *  cannot be opened, its size cannot be determined, or it is an
  *  unrecognised EBML container; false if nothing matched and the callback
  *  (if any) did not set a type
  * @throws UnexpectedValueException If seeking to the tail of the file fails
  */
 private function doGuessMimeType($file, $ext)
 {
     // Read a chunk of the file
     MediaWiki\suppressWarnings();
     $f = fopen($file, 'rb');
     MediaWiki\restoreWarnings();
     if (!$f) {
         return 'unknown/unknown';
     }
     $fsize = filesize($file);
     if ($fsize === false) {
         // Do not leak the handle on this early exit
         fclose($f);
         return 'unknown/unknown';
     }
     $head = fread($f, 1024);
     $tailLength = min(65558, $fsize);
     // 65558 = maximum size of a zip EOCDR
     if (fseek($f, -1 * $tailLength, SEEK_END) === -1) {
         // Close before throwing so the handle is released on the error path too
         fclose($f);
         throw new UnexpectedValueException("Seeking {$tailLength} bytes from EOF failed in " . __METHOD__);
     }
     $tail = $tailLength ? fread($f, $tailLength) : '';
     fclose($f);
     $this->logger->info(__METHOD__ . ": analyzing head and tail of {$file} for magic numbers.\n");
     // Hardcode a few magic number checks...
     $headers = [
         // Multimedia
         'MThd' => 'audio/midi',
         'OggS' => 'application/ogg',
         // Windows Metafile: bare header, then the placeable-WMF magic
         "\x01\x00\x09\x00" => 'application/x-msmetafile',
         "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
         '%PDF' => 'application/pdf',
         'gimp xcf' => 'image/x-xcf',
         // Executables: DOS/Windows, Mach-O fat binary, ELF
         'MZ' => 'application/octet-stream',
         "\xca\xfe\xba\xbe" => 'application/octet-stream',
         "\x7fELF" => 'application/octet-stream',
     ];
     foreach ($headers as $magic => $candidate) {
         if (strncmp($head, $magic, strlen($magic)) === 0) {
             $this->logger->info(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
             return $candidate;
         }
     }
     /* Look for WebM and Matroska files (EBML magic 1A 45 DF A3) */
     if (strncmp($head, "\x1a\x45\xdf\xa3", 4) === 0) {
         // 0x4282 is the EBML DocType element ID
         $doctype = strpos($head, "\x42\x82");
         if ($doctype !== false) {
             // Next byte is datasize, then data (sizes larger than 1 byte are stupid muxers)
             $data = substr($head, $doctype + 3, 8);
             if (strncmp($data, "matroska", 8) === 0) {
                 $this->logger->info(__METHOD__ . ": recognized file as video/x-matroska\n");
                 return "video/x-matroska";
             } elseif (strncmp($data, "webm", 4) === 0) {
                 $this->logger->info(__METHOD__ . ": recognized file as video/webm\n");
                 return "video/webm";
             }
         }
         $this->logger->info(__METHOD__ . ": unknown EBML file\n");
         return "unknown/unknown";
     }
     /* Look for WebP */
     if (strncmp($head, "RIFF", 4) === 0 && strncmp(substr($head, 8, 7), "WEBPVP8", 7) === 0) {
         $this->logger->info(__METHOD__ . ": recognized file as image/webp\n");
         return "image/webp";
     }
     /**
      * Look for PHP.  Check for this before HTML/XML...  Warning: this is a
      * heuristic, and won't match a file with a lot of non-PHP before.  It
      * will also match text files which could be PHP. :)
      *
      * @todo FIXME: For this reason, the check is probably useless -- an attacker
      * could almost certainly just pad the file with a lot of nonsense to
      * circumvent the check in any case where it would be a security
      * problem.  On the other hand, it causes harmful false positives (bug
      * 16583).  The heuristic has been cut down to exclude three-character
      * strings like "<? ", but should it be axed completely?
      *
      * Apart from the plain '<?php', every needle below is the UTF-16LE
      * form of an open tag (each ASCII character followed by a NUL byte),
      * so the bare ASCII "<? " is deliberately not matched.
      */
     if (strpos($head, '<?php') !== false
         || strpos($head, "<\x00?\x00p\x00h\x00p") !== false
         || strpos($head, "<\x00?\x00 ") !== false
         || strpos($head, "<\x00?\x00\n") !== false
         || strpos($head, "<\x00?\x00\t") !== false
         || strpos($head, "<\x00?\x00=") !== false
     ) {
         $this->logger->info(__METHOD__ . ": recognized {$file} as application/x-php\n");
         return 'application/x-php';
     }
     /**
      * look for XML formats (XHTML and SVG)
      */
     $xml = new XmlTypeCheck($file);
     if ($xml->wellFormed) {
         $xmlTypes = $this->xmlTypes;
         $rootElement = $xml->getRootElement();
         if (isset($xmlTypes[$rootElement])) {
             return $xmlTypes[$rootElement];
         }
         return 'application/xml';
     }
     /**
      * look for shell scripts
      */
     $script_type = null;
     # detect by shebang, optionally preceded by a byte order mark;
     # each substr() length equals the byte length of the literal it is compared to
     if (substr($head, 0, 2) === "#!") {
         $script_type = "ASCII";
     } elseif (substr($head, 0, 5) === "\xef\xbb\xbf#!") {
         $script_type = "UTF-8";
     } elseif (substr($head, 0, 6) === "\xfe\xff\x00#\x00!") {
         $script_type = "UTF-16BE";
     } elseif (substr($head, 0, 6) === "\xff\xfe#\x00!\x00") {
         $script_type = "UTF-16LE";
     }
     if ($script_type) {
         if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
             // Quick and dirty fold down to ASCII!
             $pack = ['UTF-16BE' => 'n*', 'UTF-16LE' => 'v*'];
             // substr( $head, 2 ) skips the two-byte BOM
             $chars = unpack($pack[$script_type], substr($head, 2));
             $head = '';
             foreach ($chars as $codepoint) {
                 $head .= $codepoint < 128 ? chr($codepoint) : '?';
             }
         }
         $match = [];
         // The second capture group is the interpreter's base name
         if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
             $mime = "application/x-{$match[2]}";
             $this->logger->info(__METHOD__ . ": shell script recognized as {$mime}\n");
             return $mime;
         }
     }
     // Check for ZIP variants (before getimagesize): look for the
     // end-of-central-directory signature rather than any "PK" byte pair.
     if (strpos($tail, "PK\x05\x06") !== false) {
         $this->logger->info(__METHOD__ . ": ZIP header present in {$file}\n");
         return $this->detectZipType($head, $tail, $ext);
     }
     MediaWiki\suppressWarnings();
     $gis = getimagesize($file);
     MediaWiki\restoreWarnings();
     if ($gis && isset($gis['mime'])) {
         $mime = $gis['mime'];
         $this->logger->info(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
         return $mime;
     }
     # Media handling extensions can guess the MIME by content
     # It's intentionally here so that if core is wrong about a type (false positive),
     # people will hopefully nag and submit patches :)
     $mime = false;
     # Some strings by reference for performance - assuming well-behaved hooks
     $callback = $this->guessCallback;
     if ($callback) {
         // $mime is expected to be filled in by the callback, presumably by reference
         $callback($this, $head, $tail, $file, $mime);
     }
     return $mime;
 }