Merge pull request #20297 from sno2/wtf8-conversion-buffer-overflows

std: fix buffer overflows from improper WTF encoding
This commit is contained in:
Andrew Kelley 2024-07-28 20:24:31 -07:00 committed by GitHub
commit 08e83fee57
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 112 additions and 29 deletions

View File

@ -1789,6 +1789,9 @@ pub fn symLink(
// when converting to an NT namespaced path. CreateSymbolicLink in
// symLinkW will handle the necessary conversion.
var target_path_w: windows.PathSpace = undefined;
if (try std.unicode.checkWtf8ToWtf16LeOverflow(target_path, &target_path_w.data)) {
return error.NameTooLong;
}
target_path_w.len = try std.unicode.wtf8ToWtf16Le(&target_path_w.data, target_path);
target_path_w.data[target_path_w.len] = 0;
// However, we need to canonicalize any path separators to `\`, since if

View File

@ -3124,8 +3124,10 @@ pub fn chdir(dir_path: []const u8) ChangeCurDirError!void {
@compileError("WASI does not support os.chdir");
} else if (native_os == .windows) {
var wtf16_dir_path: [windows.PATH_MAX_WIDE]u16 = undefined;
const len = try std.unicode.wtf8ToWtf16Le(wtf16_dir_path[0..], dir_path);
if (len > wtf16_dir_path.len) return error.NameTooLong;
if (try std.unicode.checkWtf8ToWtf16LeOverflow(dir_path, &wtf16_dir_path)) {
return error.NameTooLong;
}
const len = try std.unicode.wtf8ToWtf16Le(&wtf16_dir_path, dir_path);
return chdirW(wtf16_dir_path[0..len]);
} else {
const dir_path_c = try toPosixPath(dir_path);
@ -3139,9 +3141,12 @@ pub fn chdir(dir_path: []const u8) ChangeCurDirError!void {
/// On other platforms, `dir_path` is an opaque sequence of bytes with no particular encoding.
pub fn chdirZ(dir_path: [*:0]const u8) ChangeCurDirError!void {
if (native_os == .windows) {
const dir_path_span = mem.span(dir_path);
var wtf16_dir_path: [windows.PATH_MAX_WIDE]u16 = undefined;
const len = try std.unicode.wtf8ToWtf16Le(wtf16_dir_path[0..], mem.span(dir_path));
if (len > wtf16_dir_path.len) return error.NameTooLong;
if (try std.unicode.checkWtf8ToWtf16LeOverflow(dir_path_span, &wtf16_dir_path)) {
return error.NameTooLong;
}
const len = try std.unicode.wtf8ToWtf16Le(&wtf16_dir_path, dir_path_span);
return chdirW(wtf16_dir_path[0..len]);
} else if (native_os == .wasi and !builtin.link_libc) {
return chdir(mem.span(dir_path));

View File

@ -22,6 +22,15 @@ const tmpDir = std.testing.tmpDir;
const Dir = std.fs.Dir;
const ArenaAllocator = std.heap.ArenaAllocator;
// https://github.com/ziglang/zig/issues/20288
test "WTF-8 to WTF-16 conversion buffer overflows" {
if (native_os != .windows) return error.SkipZigTest;
const input_wtf8 = "\u{10FFFF}" ** 16385;
try expectError(error.NameTooLong, posix.chdir(input_wtf8));
try expectError(error.NameTooLong, posix.chdirZ(input_wtf8));
}
test "chdir smoke test" {
if (native_os == .wasi) return error.SkipZigTest;

View File

@ -1405,29 +1405,38 @@ test "ArrayList functions on a re-used list" {
}
}
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
fn utf8ToUtf16LeStringLiteralImpl(comptime utf8: []const u8, comptime surrogates: Surrogates) *const [calcUtf16LeLenImpl(utf8, surrogates) catch |err| @compileError(err):0]u16 {
return comptime blk: {
const len: usize = calcUtf16LeLen(utf8) catch unreachable;
const len: usize = calcUtf16LeLenImpl(utf8, surrogates) catch unreachable;
var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
const utf16le_len = utf8ToUtf16LeImpl(&utf16le, utf8[0..], surrogates) catch |err| @compileError(err);
assert(len == utf16le_len);
const final = utf16le;
break :blk &final;
};
}
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
return utf8ToUtf16LeStringLiteralImpl(utf8, .cannot_encode_surrogate_half);
}
/// Returns length in UTF-16 of UTF-8 slice as length of []u16.
/// Length in []u8 is 2*len16.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
/// Converts a WTF-8 string literal into a WTF-16LE string literal.
pub fn wtf8ToWtf16LeStringLiteral(comptime wtf8: []const u8) *const [calcWtf16LeLen(wtf8) catch |err| @compileError(err):0]u16 {
return utf8ToUtf16LeStringLiteralImpl(wtf8, .can_encode_surrogate_half);
}
pub fn calcUtf16LeLenImpl(utf8: []const u8, comptime surrogates: Surrogates) !usize {
const utf8DecodeImpl = switch (surrogates) {
.cannot_encode_surrogate_half => utf8Decode,
.can_encode_surrogate_half => wtf8Decode,
};
var src_i: usize = 0;
var dest_len: usize = 0;
while (src_i < utf8.len) {
const n = try utf8ByteSequenceLength(utf8[src_i]);
const next_src_i = src_i + n;
const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
const codepoint = try utf8DecodeImpl(utf8[src_i..next_src_i]);
if (codepoint < 0x10000) {
dest_len += 1;
} else {
@ -1438,16 +1447,37 @@ pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
return dest_len;
}
fn testCalcUtf16LeLen() !void {
try testing.expectEqual(@as(usize, 1), try calcUtf16LeLen("a"));
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("abcdefghij"));
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("äåéëþüúíóö"));
try testing.expectEqual(@as(usize, 5), try calcUtf16LeLen("こんにちは"));
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
/// Returns length in UTF-16LE of UTF-8 slice as length of []u16.
/// Length in []u8 is 2*len16.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
return calcUtf16LeLenImpl(utf8, .cannot_encode_surrogate_half);
}
test "calculate utf16 string length of given utf8 string in u16" {
try testCalcUtf16LeLen();
try comptime testCalcUtf16LeLen();
const CalcWtf16LeLenError = Wtf8DecodeError || error{Utf8InvalidStartByte};
/// Returns length in WTF-16LE of WTF-8 slice as length of []u16.
/// Length in []u8 is 2*len16.
pub fn calcWtf16LeLen(wtf8: []const u8) CalcWtf16LeLenError!usize {
return calcUtf16LeLenImpl(wtf8, .can_encode_surrogate_half);
}
fn testCalcUtf16LeLenImpl(calcUtf16LeLenImpl_: anytype) !void {
try testing.expectEqual(@as(usize, 1), try calcUtf16LeLenImpl_("a"));
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLenImpl_("abcdefghij"));
try testing.expectEqual(@as(usize, 10), try calcUtf16LeLenImpl_("äåéëþüúíóö"));
try testing.expectEqual(@as(usize, 5), try calcUtf16LeLenImpl_("こんにちは"));
}
test calcUtf16LeLen {
try testCalcUtf16LeLenImpl(calcUtf16LeLen);
try comptime testCalcUtf16LeLenImpl(calcUtf16LeLen);
}
test calcWtf16LeLen {
try testCalcUtf16LeLenImpl(calcWtf16LeLen);
try comptime testCalcUtf16LeLenImpl(calcWtf16LeLen);
}
/// Print the given `utf16le` string, encoded as UTF-8 bytes.
@ -1487,8 +1517,10 @@ pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {
test fmtUtf16Le {
const expectFmt = testing.expectFmt;
try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
try expectFmt("", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral(""))});
try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});
try expectFmt("foo", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("foo"))});
try expectFmt("𐐷", "{}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("𐐷"))});
try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
@ -1497,12 +1529,12 @@ test fmtUtf16Le {
try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
}
test utf8ToUtf16LeStringLiteral {
fn testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral_: anytype) !void {
{
const bytes = [_:0]u16{
mem.nativeToLittle(u16, 0x41),
};
const utf16 = utf8ToUtf16LeStringLiteral("A");
const utf16 = utf8ToUtf16LeStringLiteral_("A");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[1] == 0);
}
@ -1511,7 +1543,7 @@ test utf8ToUtf16LeStringLiteral {
mem.nativeToLittle(u16, 0xD801),
mem.nativeToLittle(u16, 0xDC37),
};
const utf16 = utf8ToUtf16LeStringLiteral("𐐷");
const utf16 = utf8ToUtf16LeStringLiteral_("𐐷");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[2] == 0);
}
@ -1519,7 +1551,7 @@ test utf8ToUtf16LeStringLiteral {
const bytes = [_:0]u16{
mem.nativeToLittle(u16, 0x02FF),
};
const utf16 = utf8ToUtf16LeStringLiteral("\u{02FF}");
const utf16 = utf8ToUtf16LeStringLiteral_("\u{02FF}");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[1] == 0);
}
@ -1527,7 +1559,7 @@ test utf8ToUtf16LeStringLiteral {
const bytes = [_:0]u16{
mem.nativeToLittle(u16, 0x7FF),
};
const utf16 = utf8ToUtf16LeStringLiteral("\u{7FF}");
const utf16 = utf8ToUtf16LeStringLiteral_("\u{7FF}");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[1] == 0);
}
@ -1535,7 +1567,7 @@ test utf8ToUtf16LeStringLiteral {
const bytes = [_:0]u16{
mem.nativeToLittle(u16, 0x801),
};
const utf16 = utf8ToUtf16LeStringLiteral("\u{801}");
const utf16 = utf8ToUtf16LeStringLiteral_("\u{801}");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[1] == 0);
}
@ -1544,12 +1576,20 @@ test utf8ToUtf16LeStringLiteral {
mem.nativeToLittle(u16, 0xDBFF),
mem.nativeToLittle(u16, 0xDFFF),
};
const utf16 = utf8ToUtf16LeStringLiteral("\u{10FFFF}");
const utf16 = utf8ToUtf16LeStringLiteral_("\u{10FFFF}");
try testing.expectEqualSlices(u16, &bytes, utf16);
try testing.expect(utf16[2] == 0);
}
}
test utf8ToUtf16LeStringLiteral {
try testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral);
}
test wtf8ToWtf16LeStringLiteral {
try testUtf8ToUtf16LeStringLiteral(wtf8ToWtf16LeStringLiteral);
}
fn testUtf8CountCodepoints() !void {
try testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
try testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
@ -1795,6 +1835,30 @@ pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) error{InvalidWtf8}!usize
return utf8ToUtf16LeImpl(wtf16le, wtf8, .can_encode_surrogate_half);
}
fn checkUtf8ToUtf16LeOverflowImpl(utf8: []const u8, utf16le: []const u16, comptime surrogates: Surrogates) !bool {
// Each u8 in UTF-8/WTF-8 correlates to at most one u16 in UTF-16LE/WTF-16LE.
if (utf16le.len >= utf8.len) return false;
const utf16_len = calcUtf16LeLenImpl(utf8, surrogates) catch {
return switch (surrogates) {
.cannot_encode_surrogate_half => error.InvalidUtf8,
.can_encode_surrogate_half => error.InvalidWtf8,
};
};
return utf16_len > utf16le.len;
}
/// Checks if calling `utf8ToUtf16Le` would overflow. Might fail if utf8 is not
/// valid UTF-8.
pub fn checkUtf8ToUtf16LeOverflow(utf8: []const u8, utf16le: []const u16) error{InvalidUtf8}!bool {
return checkUtf8ToUtf16LeOverflowImpl(utf8, utf16le, .cannot_encode_surrogate_half);
}
/// Checks if calling `utf8ToUtf16Le` would overflow. Might fail if wtf8 is not
/// valid WTF-8.
pub fn checkWtf8ToWtf16LeOverflow(wtf8: []const u8, wtf16le: []const u16) error{InvalidWtf8}!bool {
return checkUtf8ToUtf16LeOverflowImpl(wtf8, wtf16le, .can_encode_surrogate_half);
}
/// Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
/// character (U+FFFD).
/// All surrogate codepoints and the replacement character are encoded as three
@ -2000,6 +2064,8 @@ fn testRoundtripWtf8(wtf8: []const u8) !void {
{
var wtf16_buf: [32]u16 = undefined;
const wtf16_len = try wtf8ToWtf16Le(&wtf16_buf, wtf8);
try testing.expectEqual(wtf16_len, calcWtf16LeLen(wtf8));
try testing.expectEqual(false, checkWtf8ToWtf16LeOverflow(wtf8, &wtf16_buf));
const wtf16 = wtf16_buf[0..wtf16_len];
var roundtripped_buf: [32]u8 = undefined;