zig fmt: factorize source file reading and decoding

Now reading a source file and decoding it from UTF-16LE to UTF-8 is done in a single function. Error messages are improved, and an error is emitted when the source file has a BOM not supported (UTF-16BE, UTF-32). Please note that the BOM of UTF-32 is composed of the same bytes as the BOM of UTF-16 followed by a null character. Therefore a source file in UTF-16LE starting with a null byte will be interpreted as an UTF-32, and rejeted because of an invalid format. In pratice this is not a problem, as the code would have been rejected later anyway because of the null character.
2026-02-16 06:18:32 +00:00 · 2021-03-14 18:07:09 +01:00 · 2021-03-14 18:07:09 +01:00 · 8942243f7a
commit 8942243f7a
parent a354000090
1 changed files with 55 additions and 47 deletions
--- a/src/main.zig
+++ b/src/main.zig
@ -2637,6 +2637,50 @@ fn argvCmd(allocator: *Allocator, argv: []const []const u8) ![]u8 {
    return cmd.toOwnedSlice();
 }

+fn readSourceFileToEndAlloc(allocator: *mem.Allocator, input: *const fs.File, size_hint: ?usize) ![]const u8 {
+    const source_code = input.readToEndAllocOptions(
+        allocator,
+        max_src_size,
+        size_hint,
+        @alignOf(u16),
+        null,
+    ) catch |err| switch (err) {
+        error.ConnectionResetByPeer => unreachable,
+        error.ConnectionTimedOut => unreachable,
+        error.NotOpenForReading => unreachable,
+        else => |e| return e,
+    };
+    errdefer allocator.free(source_code);
+
+    // Detect unsupported file types with their Byte Order Mark
+    const unsupported_boms = [_][]const u8{
+        "\xff\xfe\x00\x00", // UTF-32 little endian
+        "\xfe\xff\x00\x00", // UTF-32 big endian
+        "\xfe\xff", // UTF-16 big endian
+    };
+    for (unsupported_boms) |bom| {
+        if (mem.startsWith(u8, source_code, bom)) {
+            return error.UnsupportedEncoding;
+        }
+    }
+
+    // If the file starts with a UTF-16 little endian BOM, translate it to UTF-8
+    if (mem.startsWith(u8, source_code, "\xff\xfe")) {
+        const source_code_utf16_le = mem.bytesAsSlice(u16, source_code);
+        const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(allocator, source_code_utf16_le) catch |err| switch (err) {
+            error.DanglingSurrogateHalf => error.UnsupportedEncoding,
+            error.ExpectedSecondSurrogateHalf => error.UnsupportedEncoding,
+            error.UnexpectedSecondSurrogateHalf => error.UnsupportedEncoding,
+            else => |e| return e,
+        };
+
+        allocator.free(source_code);
+        return source_code_utf8;
+    }
+
+    return source_code;
+}
+
 pub const usage_fmt =
    \\Usage: zig fmt [file]...
    \\
@ -2709,20 +2753,8 @@ pub fn cmdFmt(gpa: *Allocator, args: []const []const u8) !void {
        }

        const stdin = io.getStdIn();
-
-        const source_code = blk: {
-            const source_code = try stdin.readToEndAllocOptions(gpa, max_src_size, null, @alignOf(u16), null);
-            errdefer gpa.free(source_code);
-
-            // If the file starts with a UTF-16 BOM, translate it to UTF-8
-            if (mem.startsWith(u8, source_code, "\xff\xfe")) {
-                const source_code_utf16_le = mem.bytesAsSlice(u16, source_code);
-                const source_code_utf8 = try std.unicode.utf16leToUtf8Alloc(gpa, source_code_utf16_le);
-                gpa.free(source_code);
-                break :blk source_code_utf8;
-            } else {
-                break :blk source_code;
-            }
+        const source_code = readSourceFileToEndAlloc(gpa, &stdin, null) catch |err| {
+            fatal("unable to read stdin: {s}", .{err});
        };
        defer gpa.free(source_code);

@ -2798,7 +2830,7 @@ const FmtError = error{
    EndOfStream,
    Unseekable,
    NotOpenForWriting,
-    UnknownTextFormat,
+    UnsupportedEncoding,
 } || fs.File.OpenError;

 fn fmtPath(fmt: *Fmt, file_path: []const u8, check_mode: bool, dir: fs.Dir, sub_path: []const u8) FmtError!void {
@ -2864,40 +2896,16 @@ fn fmtPathFile(
    if (stat.kind == .Directory)
        return error.IsDir;

-    const source_code = blk: {
-        const source_code = source_file.readToEndAllocOptions(
-            fmt.gpa,
-            max_src_size,
-            std.math.cast(usize, stat.size) catch return error.FileTooBig,
-            @alignOf(u16),
-            null,
-        ) catch |err| switch (err) {
-            error.ConnectionResetByPeer => unreachable,
-            error.ConnectionTimedOut => unreachable,
-            error.NotOpenForReading => unreachable,
-            else => |e| return e,
-        };
-        source_file.close();
-        file_closed = true;
-        errdefer fmt.gpa.free(source_code);
-
-        // If the file starts with a UTF-16 BOM, translate it to UTF-8
-        if (mem.eql(u8, source_code[0..2], "\xff\xfe")) {
-            const source_code_utf16_le = mem.bytesAsSlice(u16, source_code);
-            const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(fmt.gpa, source_code_utf16_le) catch |err| return switch (err) {
-                error.DanglingSurrogateHalf => FmtError.UnknownTextFormat,
-                error.ExpectedSecondSurrogateHalf => FmtError.UnknownTextFormat,
-                error.UnexpectedSecondSurrogateHalf => FmtError.UnknownTextFormat,
-                else => |e| e,
-            };
-            fmt.gpa.free(source_code);
-            break :blk source_code_utf8;
-        } else {
-            break :blk source_code;
-        }
-    };
+    const source_code = try readSourceFileToEndAlloc(
+        fmt.gpa,
+        &source_file,
+        std.math.cast(usize, stat.size) catch return error.FileTooBig,
+    );
    defer fmt.gpa.free(source_code);

+    source_file.close();
+    file_closed = true;
+
    // Add to set after no longer possible to get error.IsDir.
    if (try fmt.seen.fetchPut(stat.inode, {})) |_| return;