From e0046b737eddffe0d6881d202608769fe720ff94 Mon Sep 17 00:00:00 2001 From: Vexu Date: Fri, 20 Dec 2019 13:50:34 +0200 Subject: [PATCH] translate-c-2 improve macro escape sequences --- src-self-hosted/c_tokenizer.zig | 262 +++++++++++++++++++++++++------- test/translate_c.zig | 53 ++++--- 2 files changed, 236 insertions(+), 79 deletions(-) diff --git a/src-self-hosted/c_tokenizer.zig b/src-self-hosted/c_tokenizer.zig index 7685cdc537..075ea86a0b 100644 --- a/src-self-hosted/c_tokenizer.zig +++ b/src-self-hosted/c_tokenizer.zig @@ -74,69 +74,191 @@ fn zigifyEscapeSequences(allocator: *std.mem.Allocator, tok: CToken) !CToken { } } else return tok; var bytes = try allocator.alloc(u8, tok.bytes.len * 2); - var escape = false; + var state: enum { + Start, + Escape, + Hex, + Octal, + HexZero, + OctalZero, + } = .Start; var i: usize = 0; + var count: u8 = 0; + var num: u8 = 0; for (tok.bytes) |c| { - if (escape) { - switch (c) { - 'n', 'r', 't', '\\', '\'', '\"', 'x' => { - bytes[i] = c; - }, - 'a' => { - bytes[i] = 'x'; - i += 1; - bytes[i] = '0'; - i += 1; - bytes[i] = '7'; - }, - 'b' => { - bytes[i] = 'x'; - i += 1; - bytes[i] = '0'; - i += 1; - bytes[i] = '8'; - }, - 'f' => { - bytes[i] = 'x'; - i += 1; - bytes[i] = '0'; - i += 1; - bytes[i] = 'C'; - }, - 'v' => { - bytes[i] = 'x'; - i += 1; - bytes[i] = '0'; - i += 1; - bytes[i] = 'B'; - }, - '?' => { - i -= 1; - bytes[i] = '?'; - }, - 'u', 'U' => { - // TODO unicode escape sequences - return error.TokenizingFailed; - }, - '0'...'7' => { - // TODO octal escape sequences - return error.TokenizingFailed; - }, - else => { - // unknown escape sequence - return error.TokenizingFailed; - }, - } - i += 1; - escape = false; - } else { - if (c == '\\') { - escape = true; - } - bytes[i] = c; - i += 1; + switch (state) { + .Escape => { + switch (c) { + 'n', 'r', 't', '\\', '\'', '\"' => { + bytes[i] = c; + }, + '0' => { + state = .OctalZero; + bytes[i] = 'x'; + }, + '1'...'7' => { + count += 1; + num *= 8; + num += c - '0'; + state = .Octal; + bytes[i] = 'x'; + }, + 'x' => { + state = .HexZero; + bytes[i] = c; + }, + 'a' => { + bytes[i] = 'x'; + i += 1; + bytes[i] = '0'; + i += 1; + bytes[i] = '7'; + }, + 'b' => { + bytes[i] = 'x'; + i += 1; + bytes[i] = '0'; + i += 1; + bytes[i] = '8'; + }, + 'f' => { + bytes[i] = 'x'; + i += 1; + bytes[i] = '0'; + i += 1; + bytes[i] = 'C'; + }, + 'v' => { + bytes[i] = 'x'; + i += 1; + bytes[i] = '0'; + i += 1; + bytes[i] = 'B'; + }, + '?' => { + i -= 1; + bytes[i] = '?'; + }, + 'u', 'U' => { + // TODO unicode escape sequences + return error.TokenizingFailed; + }, + else => { + // unknown escape sequence + return error.TokenizingFailed; + }, + } + i += 1; + if (state == .Escape) + state = .Start; + }, + .Start => { + if (c == '\\') { + state = .Escape; + } + bytes[i] = c; + i += 1; + }, + .HexZero => { + switch (c) { + '0' => { continue; }, + '1'...'9' => { + count += 1; + num *= 16; + num += c - '0'; + }, + 'a'...'f' => { + count += 1; + num *= 16; + num += c - 'a' + 10; + }, + 'A'...'F' => { + count += 1; + num *= 16; + num += c - 'A' + 10; + }, + else => {}, + } + state = .Hex; + }, + .Hex => { + switch (c) { + '0'...'9' => { + count += 1; + num *= 16; + num += c - '0'; + if (count < 2) + continue; + }, + 'a'...'f' => { + count += 1; + num *= 16; + num += c - 'a' + 10; + if (count < 2) + continue; + }, + 'A'...'F' => { + count += 1; + num *= 16; + num += c - 'A' + 10; + if (count < 2) + continue; + }, + else => {}, + } + i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2}); + switch (c) { + '\\' => state = .Escape, + '0'...'9', 'a'...'f','A'...'F' => state = .Start, + else => { + state = .Start; + bytes[i] = c; + i += 1; + }, + } + count = 0; + num = 0; + }, + .OctalZero => { + switch (c) { + '0' => { continue; }, + '1'...'7' => { + count += 1; + num *= 8; + num += c - '0'; + }, + else => {}, + } + state = .Octal; + }, + .Octal => { + switch (c) { + '0'...'7' => { + count += 1; + num *= 8; + num += c - '0'; + if (count < 3) + continue; + }, + else => {}, + } + i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2}); + switch (c) { + '\\' => state = .Escape, + '0'...'7' => state = .Start, + else => { + state = .Start; + bytes[i] = c; + i += 1; + }, + } + count = 0; + num = 0; + }, } } + if (state == .Hex or state == .Octal) + i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2}); return CToken{ .id = tok.id, .bytes = bytes[0..i], @@ -666,3 +788,25 @@ test "tokenize macro" { expect(it.next() == null); tl.shrink(0); } + +test "escape sequences" { + var buf: [1024]u8 = undefined; + var alloc = std.heap.FixedBufferAllocator.init(buf[0..]); + const a = &alloc.allocator; + expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ + .id = .StrLit, + .bytes = "\\x0077", + })).bytes, "\\x77")); + expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ + .id = .StrLit, + .bytes = "\\00245", + })).bytes, "\\xa5")); + expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ + .id = .StrLit, + .bytes = "\\x0077abc", + })).bytes, "\\x77abc")); + expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ + .id = .StrLit, + .bytes = "\\045abc", + })).bytes, "\\x25abc")); +} diff --git a/test/translate_c.zig b/test/translate_c.zig index c3174437c6..9892d66ad9 100644 --- a/test/translate_c.zig +++ b/test/translate_c.zig @@ -1089,13 +1089,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void { \\} }); - cases.add_2("macro escape sequences", + cases.add_2("macro defines string literal with hex", \\#define FOO "aoeu\xab derp" - \\#define FOO2 "aoeu\a derp" + \\#define FOO2 "aoeu\x0007a derp" + \\#define FOO_CHAR '\xfF' , &[_][]const u8{ \\pub const FOO = "aoeu\xab derp"; , - \\pub const FOO2 = "aoeu\x07 derp"; + \\pub const FOO2 = "aoeu\x7a derp"; + , + \\pub const FOO_CHAR = '\xff'; }); cases.add_2("variable aliasing", @@ -2157,30 +2160,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void { \\} }); - /////////////// Cases for only stage1 which are TODO items for stage2 //////////////// - - cases.add("macro defines string literal with hex", - \\#define FOO "aoeu\xab derp" - \\#define FOO2 "aoeu\x0007a derp" - \\#define FOO_CHAR '\xfF' - , &[_][]const u8{ - \\pub const FOO = "aoeu\xab derp"; - , - \\pub const FOO2 = "aoeuz derp"; - , - \\pub const FOO_CHAR = 255; - }); - - cases.add("macro defines string literal with octal", + cases.add_2("macro defines string literal with octal", \\#define FOO "aoeu\023 derp" \\#define FOO2 "aoeu\0234 derp" \\#define FOO_CHAR '\077' , &[_][]const u8{ \\pub const FOO = "aoeu\x13 derp"; , - \\pub const FOO2 = "aoeu\x134 derp"; + \\pub const FOO2 = "aoeu\x9c derp"; , - \\pub const FOO_CHAR = 63; + \\pub const FOO_CHAR = '\x3f'; }); /////////////// Cases for only stage1 because stage2 behavior is better //////////////// @@ -3111,4 +3100,28 @@ pub fn addCases(cases: *tests.TranslateCContext) void { \\ _ = baz.?(); \\} }); + + cases.add("macro defines string literal with hex", + \\#define FOO "aoeu\xab derp" + \\#define FOO2 "aoeu\x0007a derp" + \\#define FOO_CHAR '\xfF' + , &[_][]const u8{ + \\pub const FOO = "aoeu\xab derp"; + , + \\pub const FOO2 = "aoeuz derp"; + , + \\pub const FOO_CHAR = 255; + }); + + cases.add("macro defines string literal with octal", + \\#define FOO "aoeu\023 derp" + \\#define FOO2 "aoeu\0234 derp" + \\#define FOO_CHAR '\077' + , &[_][]const u8{ + \\pub const FOO = "aoeu\x13 derp"; + , + \\pub const FOO2 = "aoeu\x134 derp"; + , + \\pub const FOO_CHAR = 63; + }); }