/**
 * Recursive entity expansion ("billion laughs") must be rejected.
 *
 * The document below nests eight levels of entity references. A checker
 * that expanded them naively would most likely exhaust the test runner's
 * memory, so the input has to be reported as not well-formed.
 */
public function testRecursiveEntity() {
    $entityBomb = <<<'XML'
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
<!ENTITY test "&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;">
<!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
<!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
<!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
<!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
<!ENTITY e "&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;">
<!ENTITY f "&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;">
<!ENTITY g "-00000000000000000000000000000000000000000000000000000000000000000000000-">
]>
<foo>
<bar>&test;</bar>
</foo>
XML;

    // Build the checker from the in-memory string and inspect its verdict.
    $isWellFormed = XmlTypeCheck::newFromString($entityBomb)->wellFormed;

    $this->assertFalse($isWellFormed);
}
/**
 * Guess the MIME type of a file by inspecting its contents.
 *
 * The first 1024 bytes ("head") and up to the last 65558 bytes ("tail")
 * are read and checked, in this order, against: hard-coded magic numbers,
 * EBML (WebM/Matroska), WebP, a PHP heuristic, XML root elements, shebang
 * lines, ZIP signatures in the tail, getimagesize() and DjVu.
 *
 * @param string $file Path of the file to inspect
 * @param mixed $ext Passed through unchanged to detectZipType()
 * @return bool|string The detected MIME type, 'unknown/unknown' when the
 *   file cannot be opened or is an unrecognised EBML file, or false when
 *   no check matched
 */
private function doGuessMimeType($file, $ext) {
    // TODO: remove $ext param

    // Open in binary mode: text mode may translate line endings on some
    // platforms, which would corrupt the magic numbers compared below.
    wfSuppressWarnings();
    $f = fopen($file, 'rb');
    wfRestoreWarnings();

    if (!$f) {
        return 'unknown/unknown';
    }

    $head = (string)fread($f, 1024);

    // 65558 = maximum size of a zip EOCDR. Clamp to the real file size:
    // seeking back further than the start of the file fails and would
    // leave the read position wherever the head read stopped.
    $stat = fstat($f);
    $tailLength = $stat ? min(65558, $stat['size']) : 0;
    $tail = '';
    if ($tailLength > 0 && fseek($f, -$tailLength, SEEK_END) === 0) {
        $tail = (string)fread($f, $tailLength);
    }
    fclose($f);

    wfDebug(__METHOD__ . ": analyzing head and tail of {$file} for magic numbers.\n");

    // Hardcode a few magic number checks. Binary signatures are written
    // as escape sequences so they do not depend on the source encoding.
    $headers = array(
        'MThd' => 'audio/midi',
        'OggS' => 'application/ogg',
        // Windows Metafile: standard header and placeable header
        "\x01\x00\x09\x00" => 'application/x-msmetafile',
        "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
        '%PDF' => 'application/pdf',
        'gimp xcf' => 'image/x-xcf',
        // Some forbidden fruit: executables and class files
        'MZ' => 'application/octet-stream',
        "\xca\xfe\xba\xbe" => 'application/octet-stream',
        "\x7fELF" => 'application/octet-stream',
    );

    foreach ($headers as $magic => $candidate) {
        if (strncmp($head, $magic, strlen($magic)) === 0) {
            wfDebug(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
            return $candidate;
        }
    }

    /* Look for WebM and Matroska files */
    if (strncmp($head, pack("C4", 0x1a, 0x45, 0xdf, 0xa3), 4) === 0) {
        // 0x42 0x82 is the EBML DocType element ID
        $doctype = strpos($head, "\x42\x82");
        if ($doctype !== false) {
            // Next byte is datasize, then data (sizes larger than 1 byte are very stupid muxers)
            $data = substr($head, $doctype + 3, 8);
            if (strncmp($data, "matroska", 8) === 0) {
                wfDebug(__METHOD__ . ": recognized file as video/x-matroska\n");
                return "video/x-matroska";
            } elseif (strncmp($data, "webm", 4) === 0) {
                wfDebug(__METHOD__ . ": recognized file as video/webm\n");
                return "video/webm";
            }
        }
        wfDebug(__METHOD__ . ": unknown EBML file\n");
        return "unknown/unknown";
    }

    /* Look for WebP (matches the "VP8 ", "VP8L" and "VP8X" chunk types) */
    if (strncmp($head, "RIFF", 4) === 0
        && strncmp(substr($head, 8, 7), "WEBPVP8", 7) === 0
    ) {
        wfDebug(__METHOD__ . ": recognized file as image/webp\n");
        return "image/webp";
    }

    /**
     * Look for PHP. Check for this before HTML/XML... Warning: this is a
     * heuristic, and won't match a file with a lot of non-PHP before. It
     * will also match text files which could be PHP. :)
     *
     * @todo FIXME: For this reason, the check is probably useless -- an attacker
     * could almost certainly just pad the file with a lot of nonsense to
     * circumvent the check in any case where it would be a security
     * problem. On the other hand, it causes harmful false positives (bug
     * 16583). The heuristic has been cut down to exclude three-character
     * strings like "<? ", but should it be axed completely?
     *
     * Apart from the plain '<?php' tag, only the NUL-interleaved
     * (UTF-16LE) spellings of the open tags are searched for.
     */
    $phpMarkers = array(
        '<?php',
        "<\x00?\x00p\x00h\x00p",
        "<\x00?\x00 ",
        "<\x00?\x00\n",
        "<\x00?\x00\t",
        "<\x00?\x00=",
    );
    foreach ($phpMarkers as $marker) {
        if (strpos($head, $marker) !== false) {
            wfDebug(__METHOD__ . ": recognized {$file} as application/x-php\n");
            return 'application/x-php';
        }
    }

    /**
     * look for XML formats (XHTML and SVG)
     */
    $xml = new XmlTypeCheck($file);
    if ($xml->wellFormed) {
        global $wgXMLMimeTypes;
        $root = $xml->getRootElement();
        if (isset($wgXMLMimeTypes[$root])) {
            return $wgXMLMimeTypes[$root];
        }
        return 'application/xml';
    }

    /**
     * look for shell scripts
     */
    $script_type = null;

    # detect by shebang, optionally preceded by a byte order mark;
    # each prefix is compared over its own length
    $shebangs = array(
        'ASCII' => "#!",
        'UTF-8' => "\xef\xbb\xbf#!",
        'UTF-16BE' => "\xfe\xff\x00#\x00!",
        'UTF-16LE' => "\xff\xfe#\x00!\x00",
    );
    foreach ($shebangs as $type => $prefix) {
        if (strncmp($head, $prefix, strlen($prefix)) === 0) {
            $script_type = $type;
            break;
        }
    }

    if ($script_type) {
        if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
            // Quick and dirty fold down to ASCII!
            $pack = array('UTF-16BE' => 'n*', 'UTF-16LE' => 'v*');
            $chars = unpack($pack[$script_type], substr($head, 2));
            $head = '';
            foreach ($chars as $codepoint) {
                $head .= $codepoint < 128 ? chr($codepoint) : '?';
            }
        }

        $match = array();

        // The second capture is the word following the interpreter's directory path
        if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
            $mime = "application/x-{$match[2]}";
            wfDebug(__METHOD__ . ": shell script recognized as {$mime}\n");
            return $mime;
        }
    }

    // Check for ZIP variants (before getimagesize)
    if (strpos($tail, "PK") !== false) {
        wfDebug(__METHOD__ . ": ZIP header present in {$file}\n");
        return $this->detectZipType($head, $tail, $ext);
    }

    wfSuppressWarnings();
    $gis = getimagesize($file);
    wfRestoreWarnings();

    if ($gis && isset($gis['mime'])) {
        $mime = $gis['mime'];
        wfDebug(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
        return $mime;
    }

    // Also test DjVu
    $deja = new DjVuImage($file);
    if ($deja->isValid()) {
        wfDebug(__METHOD__ . ": detected {$file} as image/vnd.djvu\n");
        return 'image/vnd.djvu';
    }

    return false;
}
/**
 * Malformed XML supplied as a string must not be reported as well-formed.
 *
 * @covers XMLTypeCheck::newFromString
 */
public function testMalFormedXML() {
    $isWellFormed = XmlTypeCheck::newFromString(self::MAL_FORMED_XML)->wellFormed;

    $this->assertFalse($isWellFormed);
}
/**
 * Guess the MIME type of a file by inspecting its contents.
 *
 * The first 1024 bytes ("head") and up to the last 65558 bytes ("tail")
 * are read and checked, in this order, against: hard-coded magic numbers,
 * a PHP heuristic, XML root elements, shebang lines, a ZIP signature in
 * the tail, getimagesize() and DjVu.
 *
 * @param string $file Path of the file to inspect
 * @param mixed $ext Not used by this implementation
 * @return bool|string The detected MIME type, "unknown/unknown" when the
 *   file cannot be opened, or false when no check matched
 */
function doGuessMimeType($file, $ext = true) {
    // Read a chunk of the file. Binary mode is used because text mode may
    // translate line endings on some platforms and corrupt magic numbers.
    wfSuppressWarnings();
    $f = fopen($file, "rb");
    wfRestoreWarnings();

    if (!$f) {
        return "unknown/unknown";
    }

    $head = (string)fread($f, 1024);

    // 65558 = maximum size of a zip EOCDR. Clamp to the real file size:
    // seeking back further than the start of the file fails and would
    // leave the read position wherever the head read stopped.
    $stat = fstat($f);
    $tailLength = $stat ? min(65558, $stat['size']) : 0;
    $tail = '';
    if ($tailLength > 0 && fseek($f, -$tailLength, SEEK_END) === 0) {
        $tail = (string)fread($f, $tailLength);
    }
    fclose($f);

    // Hardcode a few magic number checks. Binary signatures are written
    // as escape sequences so they do not depend on the source encoding.
    $headers = array(
        'MThd' => 'audio/midi',
        'OggS' => 'application/ogg',
        // Windows Metafile: standard header and placeable header
        "\x01\x00\x09\x00" => 'application/x-msmetafile',
        "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
        '%PDF' => 'application/pdf',
        'gimp xcf' => 'image/x-xcf',
        // Executables and class files
        'MZ' => 'application/octet-stream',
        "\xca\xfe\xba\xbe" => 'application/octet-stream',
        "\x7fELF" => 'application/octet-stream',
    );

    foreach ($headers as $magic => $candidate) {
        if (strncmp($head, $magic, strlen($magic)) === 0) {
            wfDebug(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
            return $candidate;
        }
    }

    /*
     * look for PHP
     * Check for this before HTML/XML...
     * Warning: this is a heuristic, and won't match a file with a lot of non-PHP before.
     * It will also match text files which could be PHP. :)
     *
     * Both the plain spellings of the open tags and their NUL-interleaved
     * (UTF-16LE) spellings are searched for.
     */
    $phpMarkers = array(
        '<?php',
        '<? ',
        "<?\n",
        "<?\t",
        "<?=",
        "<\x00?\x00p\x00h\x00p",
        "<\x00?\x00 ",
        "<\x00?\x00\n",
        "<\x00?\x00\t",
        "<\x00?\x00=",
    );
    foreach ($phpMarkers as $marker) {
        if (strpos($head, $marker) !== false) {
            wfDebug(__METHOD__ . ": recognized {$file} as application/x-php\n");
            return "application/x-php";
        }
    }

    /*
     * look for XML formats (XHTML and SVG)
     */
    $xml = new XmlTypeCheck($file);
    if ($xml->wellFormed) {
        global $wgXMLMimeTypes;
        $root = $xml->getRootElement();
        if (isset($wgXMLMimeTypes[$root])) {
            return $wgXMLMimeTypes[$root];
        }
        return 'application/xml';
    }

    /*
     * look for shell scripts
     */
    $script_type = null;

    # detect by shebang, optionally preceded by a byte order mark;
    # each prefix is compared over its own length
    $shebangs = array(
        'ASCII' => "#!",
        'UTF-8' => "\xef\xbb\xbf#!",
        'UTF-16BE' => "\xfe\xff\x00#\x00!",
        'UTF-16LE' => "\xff\xfe#\x00!\x00",
    );
    foreach ($shebangs as $type => $prefix) {
        if (strncmp($head, $prefix, strlen($prefix)) === 0) {
            $script_type = $type;
            break;
        }
    }

    if ($script_type) {
        if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
            // Quick and dirty fold down to ASCII!
            $pack = array('UTF-16BE' => 'n*', 'UTF-16LE' => 'v*');
            $chars = unpack($pack[$script_type], substr($head, 2));
            $head = '';
            foreach ($chars as $codepoint) {
                $head .= $codepoint < 128 ? chr($codepoint) : '?';
            }
        }

        $match = array();

        // The second capture is the word following the interpreter's directory path
        if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
            $mime = "application/x-{$match[2]}";
            wfDebug(__METHOD__ . ": shell script recognized as {$mime}\n");
            return $mime;
        }
    }

    // Check for ZIP (before getimagesize)
    if (strpos($tail, "PK") !== false) {
        wfDebug(__METHOD__ . ": ZIP header present at end of {$file}\n");
        return $this->detectZipType($head);
    }

    wfSuppressWarnings();
    $gis = getimagesize($file);
    wfRestoreWarnings();

    if ($gis && isset($gis['mime'])) {
        $mime = $gis['mime'];
        wfDebug(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
        return $mime;
    }

    // Also test DjVu
    $deja = new DjVuImage($file);
    if ($deja->isValid()) {
        wfDebug(__METHOD__ . ": detected {$file} as image/vnd.djvu\n");
        return 'image/vnd.djvu';
    }

    return false;
}
/**
 * Guess the MIME type from the file contents.
 *
 * The first 1024 bytes ("head") and up to the last 65558 bytes ("tail")
 * are read and checked, in this order, against: hard-coded magic numbers,
 * EBML (WebM/Matroska), WebP, a PHP heuristic, XML root elements, shebang
 * lines, ZIP signatures in the tail and getimagesize(). If nothing
 * matches, the optional guess callback is given a chance to set the type.
 *
 * @todo Remove $ext param
 *
 * @param string $file Path of the file to inspect
 * @param mixed $ext Passed through unchanged to detectZipType()
 * @return bool|string The detected MIME type, 'unknown/unknown' when the
 *   file cannot be opened or sized or is an unrecognised EBML file, or
 *   false when nothing (including the callback) identified it
 * @throws UnexpectedValueException When seeking to the tail of the file fails
 */
private function doGuessMimeType($file, $ext) {
    // Read a chunk of the file
    MediaWiki\suppressWarnings();
    $f = fopen($file, 'rb');
    MediaWiki\restoreWarnings();

    if (!$f) {
        return 'unknown/unknown';
    }

    $fsize = filesize($file);
    if ($fsize === false) {
        // Do not leak the handle on this early exit
        fclose($f);
        return 'unknown/unknown';
    }

    $head = fread($f, 1024);
    $tailLength = min(65558, $fsize); // 65558 = maximum size of a zip EOCDR
    if (fseek($f, -1 * $tailLength, SEEK_END) === -1) {
        // Release the handle before reporting the failure
        fclose($f);
        throw new UnexpectedValueException("Seeking {$tailLength} bytes from EOF failed in " . __METHOD__);
    }
    $tail = $tailLength ? fread($f, $tailLength) : '';
    fclose($f);

    $this->logger->info(__METHOD__ . ": analyzing head and tail of {$file} for magic numbers.\n");

    // Hardcode a few magic number checks. Binary signatures are written
    // as escape sequences so they do not depend on the source encoding.
    $headers = [
        'MThd' => 'audio/midi',
        'OggS' => 'application/ogg',
        // Windows Metafile: standard header and placeable header
        "\x01\x00\x09\x00" => 'application/x-msmetafile',
        "\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
        '%PDF' => 'application/pdf',
        'gimp xcf' => 'image/x-xcf',
        // Some forbidden fruit: executables and class files
        'MZ' => 'application/octet-stream',
        "\xca\xfe\xba\xbe" => 'application/octet-stream',
        "\x7fELF" => 'application/octet-stream',
    ];

    foreach ($headers as $magic => $candidate) {
        if (strncmp($head, $magic, strlen($magic)) === 0) {
            $this->logger->info(__METHOD__ . ": magic header in {$file} recognized as {$candidate}\n");
            return $candidate;
        }
    }

    /* Look for WebM and Matroska files */
    if (strncmp($head, pack("C4", 0x1a, 0x45, 0xdf, 0xa3), 4) === 0) {
        // 0x42 0x82 is the EBML DocType element ID
        $doctype = strpos($head, "\x42\x82");
        if ($doctype !== false) {
            // Next byte is datasize, then data (sizes larger than 1 byte are stupid muxers)
            $data = substr($head, $doctype + 3, 8);
            if (strncmp($data, "matroska", 8) === 0) {
                $this->logger->info(__METHOD__ . ": recognized file as video/x-matroska\n");
                return "video/x-matroska";
            } elseif (strncmp($data, "webm", 4) === 0) {
                $this->logger->info(__METHOD__ . ": recognized file as video/webm\n");
                return "video/webm";
            }
        }
        $this->logger->info(__METHOD__ . ": unknown EBML file\n");
        return "unknown/unknown";
    }

    /* Look for WebP */
    if (strncmp($head, "RIFF", 4) === 0
        && strncmp(substr($head, 8, 7), "WEBPVP8", 7) === 0
    ) {
        $this->logger->info(__METHOD__ . ": recognized file as image/webp\n");
        return "image/webp";
    }

    /**
     * Look for PHP. Check for this before HTML/XML... Warning: this is a
     * heuristic, and won't match a file with a lot of non-PHP before. It
     * will also match text files which could be PHP. :)
     *
     * @todo FIXME: For this reason, the check is probably useless -- an attacker
     * could almost certainly just pad the file with a lot of nonsense to
     * circumvent the check in any case where it would be a security
     * problem. On the other hand, it causes harmful false positives (bug
     * 16583). The heuristic has been cut down to exclude three-character
     * strings like "<? ", but should it be axed completely?
     *
     * Apart from the plain '<?php' tag, only the NUL-interleaved
     * (UTF-16LE) spellings of the open tags are searched for.
     */
    $phpMarkers = [
        '<?php',
        "<\x00?\x00p\x00h\x00p",
        "<\x00?\x00 ",
        "<\x00?\x00\n",
        "<\x00?\x00\t",
        "<\x00?\x00=",
    ];
    foreach ($phpMarkers as $marker) {
        if (strpos($head, $marker) !== false) {
            $this->logger->info(__METHOD__ . ": recognized {$file} as application/x-php\n");
            return 'application/x-php';
        }
    }

    /**
     * look for XML formats (XHTML and SVG)
     */
    $xml = new XmlTypeCheck($file);
    if ($xml->wellFormed) {
        $xmlTypes = $this->xmlTypes;
        $root = $xml->getRootElement();
        if (isset($xmlTypes[$root])) {
            return $xmlTypes[$root];
        }
        return 'application/xml';
    }

    /**
     * look for shell scripts
     */
    $script_type = null;

    # detect by shebang, optionally preceded by a byte order mark;
    # each prefix is compared over its own length
    $shebangs = [
        'ASCII' => "#!",
        'UTF-8' => "\xef\xbb\xbf#!",
        'UTF-16BE' => "\xfe\xff\x00#\x00!",
        'UTF-16LE' => "\xff\xfe#\x00!\x00",
    ];
    foreach ($shebangs as $type => $prefix) {
        if (strncmp($head, $prefix, strlen($prefix)) === 0) {
            $script_type = $type;
            break;
        }
    }

    if ($script_type) {
        if ($script_type !== "UTF-8" && $script_type !== "ASCII") {
            // Quick and dirty fold down to ASCII!
            $pack = ['UTF-16BE' => 'n*', 'UTF-16LE' => 'v*'];
            $chars = unpack($pack[$script_type], substr($head, 2));
            $head = '';
            foreach ($chars as $codepoint) {
                $head .= $codepoint < 128 ? chr($codepoint) : '?';
            }
        }

        $match = [];

        // The second capture is the word following the interpreter's directory path
        if (preg_match('%/?([^\\s]+/)(\\w+)%', $head, $match)) {
            $mime = "application/x-{$match[2]}";
            $this->logger->info(__METHOD__ . ": shell script recognized as {$mime}\n");
            return $mime;
        }
    }

    // Check for ZIP variants (before getimagesize)
    if (strpos($tail, "PK") !== false) {
        $this->logger->info(__METHOD__ . ": ZIP header present in {$file}\n");
        return $this->detectZipType($head, $tail, $ext);
    }

    MediaWiki\suppressWarnings();
    $gis = getimagesize($file);
    MediaWiki\restoreWarnings();

    if ($gis && isset($gis['mime'])) {
        $mime = $gis['mime'];
        $this->logger->info(__METHOD__ . ": getimagesize detected {$file} as {$mime}\n");
        return $mime;
    }

    # Media handling extensions can guess the MIME by content
    # It's intentionally here so that if core is wrong about a type (false positive),
    # people will hopefully nag and submit patches :)
    $mime = false;
    # Some strings by reference for performance - assuming well-behaved hooks
    $callback = $this->guessCallback;
    if ($callback) {
        $callback($this, $head, $tail, $file, $mime);
    }

    return $mime;
}