From 8e631ee3e7b4e7b4466c0efafaffb4151447785f Mon Sep 17 00:00:00 2001 From: Evan Haas Date: Thu, 8 Sep 2022 20:19:10 -0700 Subject: [PATCH] translate-c: Escape non-ASCII characters that appear in macros Macro definitions are simply a slice of bytes, which may not be UTF-8 encoded. If they are not UTF-8 encoded, escape non-printable and non-ASCII characters as `\xNN`. Fixes #12784 --- src/translate_c.zig | 20 ++++++++++++++++++-- test/behavior/translate_c_macros.zig | 12 ++++++++++++ test/behavior/translate_c_macros_not_utf8.h | 5 +++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 test/behavior/translate_c_macros_not_utf8.h diff --git a/src/translate_c.zig b/src/translate_c.zig index 014f6b1934..f969bf1c8b 100644 --- a/src/translate_c.zig +++ b/src/translate_c.zig @@ -5957,20 +5957,36 @@ fn zigifyEscapeSequences(ctx: *Context, m: *MacroCtx) ![]const u8 { return bytes[0..i]; } +/// Runs zigifyEscapeSequences on the macro and returns its result as-is when it is valid UTF-8. +/// Otherwise (e.g. a C string or char literal in another encoding) escapes it with fmtSliceEscapeLower, which +/// treats non-ASCII characters (c > 127) as non-printable, so that the Zig source we output will itself be UTF-8. 
+fn escapeUnprintables(ctx: *Context, m: *MacroCtx) ![]const u8 { + const zigified = try zigifyEscapeSequences(ctx, m); + if (std.unicode.utf8ValidateSlice(zigified)) return zigified; // already valid UTF-8: emit as-is + + const formatter = std.fmt.fmtSliceEscapeLower(zigified); + const encoded_size = @intCast(usize, std.fmt.count("{s}", .{formatter})); + const output = try ctx.arena.alloc(u8, encoded_size); + return std.fmt.bufPrint(output, "{s}", .{formatter}) catch |err| switch (err) { + error.NoSpaceLeft => unreachable, // output was sized by std.fmt.count for this exact format + else => |e| return e, + }; +} + fn parseCPrimaryExprInner(c: *Context, m: *MacroCtx, scope: *Scope) ParseError!Node { const tok = m.next().?; const slice = m.slice(); switch (tok) { .CharLiteral => { if (slice[0] != '\'' or slice[1] == '\\' or slice.len == 3) { - return Tag.char_literal.create(c.arena, try zigifyEscapeSequences(c, m)); + return Tag.char_literal.create(c.arena, try escapeUnprintables(c, m)); } else { const str = try std.fmt.allocPrint(c.arena, "0x{s}", .{std.fmt.fmtSliceHexLower(slice[1 .. 
slice.len - 1])}); return Tag.integer_literal.create(c.arena, str); } }, .StringLiteral => { - return Tag.string_literal.create(c.arena, try zigifyEscapeSequences(c, m)); + return Tag.string_literal.create(c.arena, try escapeUnprintables(c, m)); }, .IntegerLiteral, .FloatLiteral => { return parseCNumLit(c, m); diff --git a/test/behavior/translate_c_macros.zig b/test/behavior/translate_c_macros.zig index 314a9028df..04d217f488 100644 --- a/test/behavior/translate_c_macros.zig +++ b/test/behavior/translate_c_macros.zig @@ -5,6 +5,7 @@ const expectEqual = std.testing.expectEqual; const expectEqualStrings = std.testing.expectEqualStrings; const h = @cImport(@cInclude("behavior/translate_c_macros.h")); +const latin1 = @cImport(@cInclude("behavior/translate_c_macros_not_utf8.h")); test "casting to void with a macro" { h.IGNORE_ME_1(42); @@ -134,3 +135,14 @@ test "string literal macro with embedded tab character" { try expectEqualStrings("hello\t", h.EMBEDDED_TAB); } + +test "string and char literals that are not UTF-8 encoded. Issue #12784" { + if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO + + try expectEqual(@as(u8, '\xA9'), latin1.UNPRINTABLE_CHAR); // expected: the char literal translates to the single byte 0xA9 + try expectEqualStrings("\xA9\xA9\xA9", latin1.UNPRINTABLE_STRING); // expected: three raw 0xA9 bytes, which are not valid UTF-8 on their own +} diff --git a/test/behavior/translate_c_macros_not_utf8.h b/test/behavior/translate_c_macros_not_utf8.h new file mode 100644 index 0000000000..0a7fa4cc6b --- /dev/null +++ b/test/behavior/translate_c_macros_not_utf8.h @@ -0,0 +1,5 @@ +// Note: This file is encoded with ISO/IEC 8859-1 (latin1), not UTF-8. +// Do not change the encoding + +#define UNPRINTABLE_STRING "İİİ" +#define UNPRINTABLE_CHAR 'İ'