From cfcf9cd9b7acd1dec2cc5c079ccac373bd8b392c Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Fri, 23 Jun 2023 01:50:45 -0700 Subject: [PATCH 1/2] Add `os.windows.nls.upcaseW`, a cross-platform implementation of `RtlUpcaseUnicodeChar` This allows doing Windows-style case insensitive comparisons from any target, but means that it will need to include its own copy of the uppercase data table (5,088 bytes) to do so. When targeting Windows, the ntdll functions are used instead to avoid including a redundant copy of the uppercase data in the resulting binary. --- lib/std/os/windows.zig | 84 ++++++++++++++++++- lib/std/os/windows/nls.zig | 166 +++++++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 lib/std/os/windows/nls.zig diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig index e12e8ac4d3..1337efdd34 100644 --- a/lib/std/os/windows.zig +++ b/lib/std/os/windows.zig @@ -29,6 +29,7 @@ pub const ws2_32 = @import("windows/ws2_32.zig"); pub const gdi32 = @import("windows/gdi32.zig"); pub const winmm = @import("windows/winmm.zig"); pub const crypt32 = @import("windows/crypt32.zig"); +pub const nls = @import("windows/nls.zig"); pub const self_process_handle = @as(HANDLE, @ptrFromInt(maxInt(usize))); @@ -1911,8 +1912,31 @@ pub fn nanoSecondsToFileTime(ns: i128) FILETIME { }; } -/// Compares two WTF16 strings using RtlEqualUnicodeString +/// Compares two WTF16 strings using the equivalent functionality of +/// `RtlEqualUnicodeString` (with case insensitive comparison enabled). +/// This function can be called on any target. pub fn eqlIgnoreCaseWTF16(a: []const u16, b: []const u16) bool { + if (@inComptime() or builtin.os.tag != .windows) { + // This function compares the strings code unit by code unit (aka u16-to-u16), + // so any length difference implies inequality. 
In other words, there's no possible
+        // conversion that changes the number of UTF-16 code units needed for the uppercase/lowercase
+        // version in the conversion table since only codepoints <= max(u16) are eligible
+        // for conversion at all.
+        if (a.len != b.len) return false;
+
+        for (a, b) |a_c, b_c| {
+            // The slices are always UTF-16 LE, so need to convert the elements to native
+            // endianness for the uppercasing
+            const a_c_native = std.mem.littleToNative(u16, a_c);
+            const b_c_native = std.mem.littleToNative(u16, b_c);
+            if (a_c != b_c and nls.upcaseW(a_c_native) != nls.upcaseW(b_c_native)) {
+                return false;
+            }
+        }
+        return true;
+    }
+    // Use RtlEqualUnicodeString on Windows when not in comptime to avoid including a
+    // redundant copy of the uppercase data.
     const a_bytes = @as(u16, @intCast(a.len * 2));
     const a_string = UNICODE_STRING{
         .Length = a_bytes,
@@ -1928,6 +1952,64 @@ pub fn eqlIgnoreCaseWTF16(a: []const u16, b: []const u16) bool {
     return ntdll.RtlEqualUnicodeString(&a_string, &b_string, TRUE) == TRUE;
 }
 
+/// Compares two UTF-8 strings using the equivalent functionality of
+/// `RtlEqualUnicodeString` (with case insensitive comparison enabled).
+/// This function can be called on any target.
+/// Assumes `a` and `b` are valid UTF-8.
+pub fn eqlIgnoreCaseUtf8(a: []const u8, b: []const u8) bool {
+    // A length equality check is not possible here because there are
+    // some codepoints whose uppercase UTF-8 representation has a different length
+    // than their lowercase counterparts, e.g. U+0250 (2 bytes) <-> U+2C6F (3 bytes).
+    // There are 7 such codepoints in the uppercase data used by Windows.
+
+    var a_utf8_it = std.unicode.Utf8View.initUnchecked(a).iterator();
+    var b_utf8_it = std.unicode.Utf8View.initUnchecked(b).iterator();
+
+    // Use RtlUpcaseUnicodeChar on Windows when not in comptime to avoid including a
+    // redundant copy of the uppercase data.
+    const upcaseImpl = switch (builtin.os.tag) {
+        .windows => if (@inComptime()) nls.upcaseW else ntdll.RtlUpcaseUnicodeChar,
+        else => nls.upcaseW,
+    };
+
+    while (true) {
+        const a_cp = a_utf8_it.nextCodepoint() orelse break; // `a` is exhausted; leftovers in `b` are checked after the loop
+        const b_cp = b_utf8_it.nextCodepoint() orelse return false; // `b` ended before `a` did
+
+        if (a_cp <= std.math.maxInt(u16) and b_cp <= std.math.maxInt(u16)) {
+            if (a_cp != b_cp and upcaseImpl(@intCast(a_cp)) != upcaseImpl(@intCast(b_cp))) {
+                return false;
+            }
+        } else if (a_cp != b_cp) { // at least one codepoint is above U+FFFF: only an exact match counts
+            return false;
+        }
+    }
+    // Make sure there are no leftover codepoints in b
+    if (b_utf8_it.nextCodepoint() != null) return false;
+
+    return true;
+}
+
+fn testEqlIgnoreCase(comptime expect_eql: bool, comptime a: []const u8, comptime b: []const u8) !void {
+    try std.testing.expectEqual(expect_eql, eqlIgnoreCaseUtf8(a, b));
+    try std.testing.expectEqual(expect_eql, eqlIgnoreCaseWTF16(
+        std.unicode.utf8ToUtf16LeStringLiteral(a),
+        std.unicode.utf8ToUtf16LeStringLiteral(b),
+    ));
+
+    try comptime std.testing.expect(expect_eql == eqlIgnoreCaseUtf8(a, b));
+    try comptime std.testing.expect(expect_eql == eqlIgnoreCaseWTF16(
+        std.unicode.utf8ToUtf16LeStringLiteral(a),
+        std.unicode.utf8ToUtf16LeStringLiteral(b),
+    ));
+}
+
+test "eqlIgnoreCaseWTF16/Utf8" {
+    try testEqlIgnoreCase(true, "\x01 a B Λ ɐ", "\x01 A b λ Ɐ");
+    // does not do case-insensitive comparison for codepoints >= U+10000
+    try testEqlIgnoreCase(false, "𐓏", "𐓷");
+}
+
 pub const PathSpace = struct {
     data: [PATH_MAX_WIDE:0]u16,
     len: usize,
diff --git a/lib/std/os/windows/nls.zig b/lib/std/os/windows/nls.zig
new file mode 100644
index 0000000000..b204997b61
--- /dev/null
+++ b/lib/std/os/windows/nls.zig
@@ -0,0 +1,166 @@
+//! Implementations of functionality related to National Language Support
+//! on Windows.
+
+const builtin = @import("builtin");
+const std = @import("../../std.zig");
+
+/// This corresponds to the uppercase table within the locale-independent
+/// l_intl.nls data (found at system32\l_intl.nls).
+/// - In l_intl.nls, this data starts at offset 0x04. +/// - In the PEB, this data starts at index [2] of peb.UnicodeCaseTableData when +/// it is casted to `[*]u16`. +/// +/// Note: This data has not changed since Windows 8.1, and has become out-of-sync with +/// the Unicode standard. +const uppercase_table = [2544]u16{ + 272, 288, 304, 320, 336, 352, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 368, 384, 400, 256, 416, 256, 256, 432, 256, 256, 256, 256, 256, 256, 256, 448, 464, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 480, 496, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 512, 528, 528, 528, 528, 528, 528, 528, 528, + 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 544, 560, 528, 528, 528, 576, 528, 528, 592, 608, + 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, + 1008, 1024, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 1040, 528, 528, 1056, 
528, 528, 1072, 1088, 1104, 1120, 1136, 1152, + 528, 528, 528, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 528, 528, 528, 1424, 1440, + 1456, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 1472, 528, 528, 528, 528, 528, 528, 528, 528, + 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, + 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 528, 528, 528, 528, 2000, 528, 528, 2016, 2032, 528, 528, 528, 528, 528, 528, 528, + 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 2048, 2064, 528, 528, 528, 528, 2080, 2096, 2112, 2128, 2144, + 2160, 2176, 2192, 2208, 2224, 2240, 2256, 528, 2272, 2288, 2304, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, + 528, 528, 528, 528, 2320, 2336, 2352, 528, 2368, 2384, 528, 528, 528, 528, 528, 528, 528, 528, 2400, 2416, 2432, 2448, 2464, 2480, + 2496, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 2512, 2528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 121, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, + 65535, 0, 
0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 65535, 0, 65535, 0, 65535, 0, 195, 0, 0, 65535, 0, 65535, 0, 0, 65535, 0, 0, 0, 65535, 0, 0, 0, + 0, 0, 65535, 0, 0, 97, 0, 0, 0, 65535, 163, 0, 0, 0, 130, 0, 0, 65535, 0, 65535, 0, 65535, 0, 0, + 65535, 0, 0, 0, 0, 65535, 0, 0, 65535, 0, 0, 0, 65535, 0, 65535, 0, 0, 65535, 0, 0, 0, 65535, 0, 56, + 0, 0, 0, 0, 0, 0, 65534, 0, 0, 65534, 0, 0, 65534, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, + 65535, 0, 65535, 0, 65535, 65457, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 0, 65534, 0, 65535, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, + 0, 0, 0, 0, 65535, 0, 0, 0, 0, 0, 65535, 0, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 10783, 10780, 0, 65326, 65330, 0, 65331, 65331, 0, 65334, 0, 65333, 0, 0, 0, 0, 65331, 0, 0, 65329, 0, 0, 0, 0, + 65327, 65325, 0, 10743, 0, 0, 0, 65325, 0, 10749, 65323, 0, 0, 65322, 0, 0, 0, 0, 0, 0, 0, 10727, 0, 0, + 65318, 0, 0, 65318, 0, 0, 0, 0, 65318, 65467, 65319, 65319, 65465, 0, 0, 0, 0, 0, 65317, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 65535, 0, 65535, 0, 0, 0, 65535, 0, 0, 0, 130, 130, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 65498, 65499, 65499, 65499, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65504, 65504, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65472, 65473, 65473, 0, 0, 0, 0, 0, 0, 0, 0, 65528, + 0, 
65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 7, 0, 0, 0, 0, 0, 65535, 0, 0, 65535, 0, 0, 0, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 65456, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 65535, 0, 65535, 0, 65535, 0, + 65535, 0, 65535, 0, 65535, 0, 65535, 65521, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, + 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35332, 0, 0, 0, 3814, 0, 0, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 
0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 86, 86, 86, 86, 100, 100, 128, 128, 112, 112, 126, 126, 0, 0, + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 65508, 0, + 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 65520, 0, 0, 0, 0, 65535, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, + 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 65510, 0, 0, 0, 0, 0, 0, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, + 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, + 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 65488, 0, 0, 65535, 0, 0, 0, 54741, 54744, 0, + 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, 0, 0, 65535, 0, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, + 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, 58272, + 58272, 58272, 58272, 58272, 58272, 58272, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 
65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65535, 0, 65535, 0, 0, 65535, + 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 0, 0, 0, 65535, 0, 0, 0, 0, 65504, 65504, 65504, 65504, 65504, 65504, 65504, + 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 65504, 0, 0, 0, 0, 0, +}; + +/// Cross-platform implementation of `ntdll.RtlUpcaseUnicodeChar`. +/// Transforms the UTF-16 code unit in `c` to its uppercased version +/// if there is one. Otherwise, returns `c` unmodified. +/// +/// Note: When this function is referenced, it will need to include +/// `uppercase_table.len * 2` bytes of data in the resulting binary +/// since it depends on the `uppercase_table` data. When +/// targeting Windows, `ntdll.RtlUpcaseUnicodeChar` can be +/// used instead to avoid having to include a copy of this data. 
+pub fn upcaseW(c: u16) u16 {
+    if (c < 'a') { // everything below 'a' is returned unchanged
+        return c;
+    }
+    if (c <= 'z') { // ASCII 'a'...'z': subtract the 'a' - 'A' distance
+        return c - ('a' - 'A');
+    }
+    if (c >= 0xC0) { // 0xC0 and above: three chained lookups into `uppercase_table`
+        var offset: u16 = 0;
+
+        offset += @as(u8, @truncate(c >> 8)); // step 1: index by the high byte of `c`
+        offset = uppercase_table[offset]; // the entry becomes the base index for step 2
+        offset += @as(u4, @truncate(c >> 4)); // step 2: add bits 4-7 of `c`
+        offset = uppercase_table[offset]; // the entry becomes the base index for step 3
+        offset += @as(u4, @truncate(c)); // step 3: add the low 4 bits of `c`
+        offset = uppercase_table[offset]; // the final entry is the amount added to `c`
+
+        return c +% offset; // wrapping add: an entry such as 65504 behaves as -32
+    }
+    return c; // 0x7B through 0xBF: returned unchanged
+}
+
+test "upcaseW matches RtlUpcaseUnicodeChar" {
+    if (builtin.os.tag != .windows) return error.SkipZigTest; // the ntdll reference used below is only exercised on Windows
+
+    var c: u16 = 0; // walks every u16 value, 0 through 0xFFFF
+    while (true) : (c += 1) {
+        std.testing.expectEqual(std.os.windows.ntdll.RtlUpcaseUnicodeChar(c), upcaseW(c)) catch |err| {
+            std.debug.print("mismatch for codepoint U+{X}\n", .{c});
+            return err;
+        };
+        if (c == 0xFFFF) break; // leave before the `c += 1` continue expression would overflow u16
+    }
+}
From 2eae013378882910257be38917cd0f9e70b80c31 Mon Sep 17 00:00:00 2001
From: Ryan Liptak
Date: Mon, 19 Jun 2023 17:03:18 -0700
Subject: [PATCH 2/2] fs.path: Fix Windows path component comparison being ASCII-only

We can use eqlIgnoreCaseUtf8 to get Unicode-aware Windows-compliant case insensitive path component comparison
---
 lib/std/fs/path.zig | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lib/std/fs/path.zig b/lib/std/fs/path.zig
index 012d99f59a..fdad297d8d 100644
--- a/lib/std/fs/path.zig
+++ b/lib/std/fs/path.zig
@@ -423,8 +423,7 @@ fn networkShareServersEql(ns1: []const u8, ns2: []const u8) bool {
 
     var it1 = mem.tokenizeScalar(u8, ns1, sep1);
     var it2 = mem.tokenizeScalar(u8, ns2, sep2);
-    // TODO ASCII is wrong, we actually need full unicode support to compare paths.
- return ascii.eqlIgnoreCase(it1.next().?, it2.next().?); + return windows.eqlIgnoreCaseUtf8(it1.next().?, it2.next().?); } fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8) bool { @@ -444,8 +443,7 @@ fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8 var it1 = mem.tokenizeScalar(u8, p1, sep1); var it2 = mem.tokenizeScalar(u8, p2, sep2); - // TODO ASCII is wrong, we actually need full unicode support to compare paths. - return ascii.eqlIgnoreCase(it1.next().?, it2.next().?) and ascii.eqlIgnoreCase(it1.next().?, it2.next().?); + return windows.eqlIgnoreCaseUtf8(it1.next().?, it2.next().?) and windows.eqlIgnoreCaseUtf8(it1.next().?, it2.next().?); }, } } @@ -1084,8 +1082,7 @@ pub fn relativeWindows(allocator: Allocator, from: []const u8, to: []const u8) ! const from_component = from_it.next() orelse return allocator.dupe(u8, to_it.rest()); const to_rest = to_it.rest(); if (to_it.next()) |to_component| { - // TODO ASCII is wrong, we actually need full unicode support to compare paths. 
- if (ascii.eqlIgnoreCase(from_component, to_component)) + if (windows.eqlIgnoreCaseUtf8(from_component, to_component)) continue; } var up_index_end = "..".len; @@ -1162,7 +1159,7 @@ test "relative" { try testRelativeWindows("c:/blah\\blah", "d:/games", "D:\\games"); try testRelativeWindows("c:/aaaa/bbbb", "c:/aaaa", ".."); try testRelativeWindows("c:/aaaa/bbbb", "c:/cccc", "..\\..\\cccc"); - try testRelativeWindows("c:/aaaa/bbbb", "c:/aaaa/bbbb", ""); + try testRelativeWindows("c:/aaaa/bbbb", "C:/aaaa/bbbb", ""); try testRelativeWindows("c:/aaaa/bbbb", "c:/aaaa/cccc", "..\\cccc"); try testRelativeWindows("c:/aaaa/", "c:/aaaa/cccc", "cccc"); try testRelativeWindows("c:/", "c:\\aaaa\\bbbb", "aaaa\\bbbb"); @@ -1188,6 +1185,10 @@ test "relative" { try testRelativeWindows("a/b/c", "a", "..\\.."); try testRelativeWindows("a/b/c", "a\\b\\c\\d", "d"); + try testRelativeWindows("\\\\FOO\\bar\\baz", "\\\\foo\\BAR\\BAZ", ""); + // Unicode-aware case-insensitive path comparison + try testRelativeWindows("\\\\кириллица\\ελληνικά\\português", "\\\\КИРИЛЛИЦА\\ΕΛΛΗΝΙΚΆ\\PORTUGUÊS", ""); + try testRelativePosix("/var/lib", "/var", ".."); try testRelativePosix("/var/lib", "/bin", "../../bin"); try testRelativePosix("/var/lib", "/var/lib", "");