From e0046b737eddffe0d6881d202608769fe720ff94 Mon Sep 17 00:00:00 2001
From: Vexu <git@vexu.eu>
Date: Fri, 20 Dec 2019 13:50:34 +0200
Subject: [PATCH] translate-c-2 improve macro escape sequences

---
 src-self-hosted/c_tokenizer.zig | 262 +++++++++++++++++++++++++-------
 test/translate_c.zig            |  53 ++++---
 2 files changed, 236 insertions(+), 79 deletions(-)

diff --git a/src-self-hosted/c_tokenizer.zig b/src-self-hosted/c_tokenizer.zig
index 7685cdc537..075ea86a0b 100644
--- a/src-self-hosted/c_tokenizer.zig
+++ b/src-self-hosted/c_tokenizer.zig
@@ -74,69 +74,191 @@ fn zigifyEscapeSequences(allocator: *std.mem.Allocator, tok: CToken) !CToken {
         }
     } else return tok;
     var bytes = try allocator.alloc(u8, tok.bytes.len * 2);
-    var escape = false;
+    var state: enum {
+        Start,
+        Escape,
+        Hex,
+        Octal,
+        HexZero,
+        OctalZero,
+    } = .Start;
     var i: usize = 0;
+    var count: u8 = 0;
+    var num: u8 = 0;
     for (tok.bytes) |c| {
-        if (escape) {
-            switch (c) {
-                'n', 'r', 't', '\\', '\'', '\"', 'x' => {
-                    bytes[i] = c;
-                },
-                'a' => {
-                    bytes[i] = 'x';
-                    i += 1;
-                    bytes[i] = '0';
-                    i += 1;
-                    bytes[i] = '7';
-                },
-                'b' => {
-                    bytes[i] = 'x';
-                    i += 1;
-                    bytes[i] = '0';
-                    i += 1;
-                    bytes[i] = '8';
-                },
-                'f' => {
-                    bytes[i] = 'x';
-                    i += 1;
-                    bytes[i] = '0';
-                    i += 1;
-                    bytes[i] = 'C';
-                },
-                'v' => {
-                    bytes[i] = 'x';
-                    i += 1;
-                    bytes[i] = '0';
-                    i += 1;
-                    bytes[i] = 'B';
-                },
-                '?' => {
-                    i -= 1;
-                    bytes[i] = '?';
-                },
-                'u', 'U' => {
-                    // TODO unicode escape sequences
-                    return error.TokenizingFailed;
-                },
-                '0'...'7' => {
-                    // TODO octal escape sequences
-                    return error.TokenizingFailed;
-                },
-                else => {
-                    // unknown escape sequence
-                    return error.TokenizingFailed;
-                },
-            }
-            i += 1;
-            escape = false;
-        } else {
-            if (c == '\\') {
-                escape = true;
-            }
-            bytes[i] = c;
-            i += 1;
+        switch (state) {
+            .Escape => {
+                switch (c) {
+                    'n', 'r', 't', '\\', '\'', '\"' => {
+                        bytes[i] = c;
+                    },
+                    '0' => {
+                        state = .OctalZero;
+                        bytes[i] = 'x';
+                    },
+                    '1'...'7' => {
+                        count += 1;
+                        num *= 8;
+                        num += c - '0';
+                        state = .Octal;
+                        bytes[i] = 'x';
+                    },
+                    'x' => {
+                        state = .HexZero;
+                        bytes[i] = c;
+                    },
+                    'a' => {
+                        bytes[i] = 'x';
+                        i += 1;
+                        bytes[i] = '0';
+                        i += 1;
+                        bytes[i] = '7';
+                    },
+                    'b' => {
+                        bytes[i] = 'x';
+                        i += 1;
+                        bytes[i] = '0';
+                        i += 1;
+                        bytes[i] = '8';
+                    },
+                    'f' => {
+                        bytes[i] = 'x';
+                        i += 1;
+                        bytes[i] = '0';
+                        i += 1;
+                        bytes[i] = 'C';
+                    },
+                    'v' => {
+                        bytes[i] = 'x';
+                        i += 1;
+                        bytes[i] = '0';
+                        i += 1;
+                        bytes[i] = 'B';
+                    },
+                    '?' => {
+                        i -= 1;
+                        bytes[i] = '?';
+                    },
+                    'u', 'U' => {
+                        // TODO unicode escape sequences
+                        return error.TokenizingFailed;
+                    },
+                    else => {
+                        // unknown escape sequence
+                        return error.TokenizingFailed;
+                    },
+                }
+                i += 1;
+                if (state == .Escape)
+                    state = .Start;
+            },
+            .Start => {
+                if (c == '\\') {
+                    state = .Escape;
+                }
+                bytes[i] = c;
+                i += 1;
+            },
+            .HexZero => {
+                switch (c) {
+                    '0' => { continue; },
+                    '1'...'9' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - '0';
+                    },
+                    'a'...'f' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - 'a' + 10;
+                    },
+                    'A'...'F' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - 'A' + 10;
+                    },
+                    else => {},
+                }
+                state = .Hex;
+            },
+            .Hex => {
+                switch (c) {
+                    '0'...'9' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - '0';
+                        if (count < 2)
+                            continue;
+                    },
+                    'a'...'f' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - 'a' + 10;
+                        if (count < 2)
+                            continue;
+                    },
+                    'A'...'F' => {
+                        count += 1;
+                        num *= 16;
+                        num += c - 'A' + 10;
+                        if (count < 2)
+                            continue;
+                    },
+                    else => {},
+                }
+                i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
+                switch (c) {
+                    '\\' => state = .Escape,
+                    '0'...'9', 'a'...'f','A'...'F' => state = .Start,
+                    else => {
+                        state = .Start;
+                        bytes[i] = c;
+                        i += 1;
+                    },
+                }
+                count = 0;
+                num = 0;
+            },
+            .OctalZero => {
+                switch (c) {
+                    '0' => { continue; },
+                    '1'...'7' => {
+                        count += 1;
+                        num *= 8;
+                        num += c - '0';
+                    },
+                    else => {},
+                }
+                state = .Octal;
+            },
+            .Octal => {
+                switch (c) {
+                    '0'...'7' => {
+                        count += 1;
+                        num *= 8;
+                        num += c - '0';
+                        if (count < 3)
+                            continue;
+                    },
+                    else => {},
+                }
+                i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
+                switch (c) {
+                    '\\' => state = .Escape,
+                    '0'...'7' => state = .Start,
+                    else => {
+                        state = .Start;
+                        bytes[i] = c;
+                        i += 1;
+                    },
+                }
+                count = 0;
+                num = 0;
+            },
         }
     }
+    if (state == .Hex or state == .Octal)
+        i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
     return CToken{
         .id = tok.id,
         .bytes = bytes[0..i],
@@ -666,3 +788,25 @@ test "tokenize macro" {
     expect(it.next() == null);
     tl.shrink(0);
 }
+
+test "escape sequences" {
+    // Table of raw C literal bodies and the escape spelling expected back from zigifyEscapeSequences.
+    const Case = struct {
+        input: []const u8,
+        expected: []const u8,
+    };
+    const cases = [_]Case{
+        Case{ .input = "\\x0077", .expected = "\\x77" }, // leading zeros dropped
+        Case{ .input = "\\00245", .expected = "\\xa5" }, // octal 245 == 0xa5
+        Case{ .input = "\\x0077abc", .expected = "\\x77abc" },
+        Case{ .input = "\\045abc", .expected = "\\x25abc" }, // octal 45 == 0x25
+    };
+    var storage: [1024]u8 = undefined;
+    var fixed = std.heap.FixedBufferAllocator.init(storage[0..]);
+    const allocator = &fixed.allocator;
+    for (cases) |case| {
+        const tok = CToken{ .id = .StrLit, .bytes = case.input };
+        const converted = try zigifyEscapeSequences(allocator, tok);
+        expect(std.mem.eql(u8, converted.bytes, case.expected));
+    }
+}
diff --git a/test/translate_c.zig b/test/translate_c.zig
index c3174437c6..9892d66ad9 100644
--- a/test/translate_c.zig
+++ b/test/translate_c.zig
@@ -1089,13 +1089,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
         \\}
     });
 
-    cases.add_2("macro escape sequences",
+    cases.add_2("macro defines string literal with hex",
         \\#define FOO "aoeu\xab derp"
-        \\#define FOO2 "aoeu\a derp"
+        \\#define FOO2 "aoeu\x0007a derp"
+        \\#define FOO_CHAR '\xfF'
     , &[_][]const u8{
         \\pub const FOO = "aoeu\xab derp";
     ,
-        \\pub const FOO2 = "aoeu\x07 derp";
+        \\pub const FOO2 = "aoeu\x7a derp";
+    ,
+        \\pub const FOO_CHAR = '\xff';
     });
 
     cases.add_2("variable aliasing",
@@ -2157,30 +2160,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
         \\}
     });
 
-    /////////////// Cases for only stage1 which are TODO items for stage2 ////////////////
-
-    cases.add("macro defines string literal with hex",
-        \\#define FOO "aoeu\xab derp"
-        \\#define FOO2 "aoeu\x0007a derp"
-        \\#define FOO_CHAR '\xfF'
-    , &[_][]const u8{
-        \\pub const FOO = "aoeu\xab derp";
-    ,
-        \\pub const FOO2 = "aoeuz derp";
-    ,
-        \\pub const FOO_CHAR = 255;
-    });
-
-    cases.add("macro defines string literal with octal",
+    cases.add_2("macro defines string literal with octal",
         \\#define FOO "aoeu\023 derp"
         \\#define FOO2 "aoeu\0234 derp"
         \\#define FOO_CHAR '\077'
     , &[_][]const u8{
         \\pub const FOO = "aoeu\x13 derp";
     ,
-        \\pub const FOO2 = "aoeu\x134 derp";
+        \\pub const FOO2 = "aoeu\x9c derp";
     ,
-        \\pub const FOO_CHAR = 63;
+        \\pub const FOO_CHAR = '\x3f';
     });
 
     /////////////// Cases for only stage1 because stage2 behavior is better ////////////////
@@ -3111,4 +3100,28 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
         \\    _ = baz.?();
         \\}
     });
+
+    cases.add("macro defines string literal with hex",
+        \\#define FOO "aoeu\xab derp"
+        \\#define FOO2 "aoeu\x0007a derp"
+        \\#define FOO_CHAR '\xfF'
+    , &[_][]const u8{
+        \\pub const FOO = "aoeu\xab derp";
+    ,
+        \\pub const FOO2 = "aoeuz derp";
+    ,
+        \\pub const FOO_CHAR = 255;
+    });
+
+    cases.add("macro defines string literal with octal",
+        \\#define FOO "aoeu\023 derp"
+        \\#define FOO2 "aoeu\0234 derp"
+        \\#define FOO_CHAR '\077'
+    , &[_][]const u8{
+        \\pub const FOO = "aoeu\x13 derp";
+    ,
+        \\pub const FOO2 = "aoeu\x134 derp";
+    ,
+        \\pub const FOO_CHAR = 63;
+    });
 }