From 8365a7aab49938ff77228b72388301f562287415 Mon Sep 17 00:00:00 2001
From: hryx
Date: Thu, 4 Jul 2019 14:48:23 -0700
Subject: [PATCH] Unicode escapes: stage2 tokenizer and parser test

---
 std/zig/parser_test.zig |   2 +-
 std/zig/tokenizer.zig   | 113 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig
index f78e666779..c0355998cb 100644
--- a/std/zig/parser_test.zig
+++ b/std/zig/parser_test.zig
@@ -68,7 +68,7 @@ test "zig fmt: enum literal inside array literal" {
 
 test "zig fmt: character literal larger than u8" {
     try testCanonical(
-        \\const x = '\U01f4a9';
+        \\const x = '\u{01f4a9}';
         \\
     );
 }
diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig
index ef171c0674..46b14dcde0 100644
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@@ -240,6 +240,9 @@ pub const Tokenizer = struct {
         CharLiteral,
         CharLiteralBackslash,
         CharLiteralHexEscape,
+        CharLiteralUnicodeEscapeSawU,
+        CharLiteralUnicodeEscape,
+        CharLiteralUnicodeInvalid,
         CharLiteralEnd,
         Backslash,
         Equal,
@@ -296,7 +299,6 @@ pub const Tokenizer = struct {
             .end = undefined,
         };
         var seen_escape_digits: usize = undefined;
-        var expected_escape_digits: usize = undefined;
         while (self.index < self.buffer.len) : (self.index += 1) {
             const c = self.buffer[self.index];
             switch (state) {
@@ -664,17 +666,9 @@ pub const Tokenizer = struct {
                     'x' => {
                         state = State.CharLiteralHexEscape;
                         seen_escape_digits = 0;
-                        expected_escape_digits = 2;
                     },
                     'u' => {
-                        state = State.CharLiteralHexEscape;
-                        seen_escape_digits = 0;
-                        expected_escape_digits = 4;
-                    },
-                    'U' => {
-                        state = State.CharLiteralHexEscape;
-                        seen_escape_digits = 0;
-                        expected_escape_digits = 6;
+                        state = State.CharLiteralUnicodeEscapeSawU;
                     },
                     else => {
                         state = State.CharLiteralEnd;
@@ -682,9 +676,9 @@ pub const Tokenizer = struct {
                 },
 
                 State.CharLiteralHexEscape => switch (c) {
-                    '0'...'9', 'a'...'z', 'A'...'F' => {
+                    '0'...'9', 'a'...'f', 'A'...'F' => {
                         seen_escape_digits += 1;
-                        if (seen_escape_digits == expected_escape_digits) {
+                        if (seen_escape_digits == 2) {
                             state = State.CharLiteralEnd;
                         }
                     },
@@ -694,6 +688,43 @@ pub const Tokenizer = struct {
                     },
                 },
 
+                State.CharLiteralUnicodeEscapeSawU => switch (c) {
+                    '{' => {
+                        state = State.CharLiteralUnicodeEscape;
+                        seen_escape_digits = 0;
+                    },
+                    else => {
+                        result.id = Token.Id.Invalid;
+                        state = State.CharLiteralUnicodeInvalid;
+                    },
+                },
+
+                State.CharLiteralUnicodeEscape => switch (c) {
+                    '0'...'9', 'a'...'f', 'A'...'F' => {
+                        seen_escape_digits += 1;
+                    },
+                    '}' => {
+                        if (seen_escape_digits == 0) {
+                            result.id = Token.Id.Invalid;
+                            state = State.CharLiteralUnicodeInvalid;
+                        } else {
+                            state = State.CharLiteralEnd;
+                        }
+                    },
+                    else => {
+                        result.id = Token.Id.Invalid;
+                        state = State.CharLiteralUnicodeInvalid;
+                    },
+                },
+
+                State.CharLiteralUnicodeInvalid => switch (c) {
+                    // Keep consuming characters until an obvious stopping point.
+                    // This consolidates e.g. `u{0ab1Q}` into a single invalid token
+                    // instead of creating the tokens `u{0ab1`, `Q`, `}`
+                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
+                    else => break,
+                },
+
                 State.CharLiteralEnd => switch (c) {
                     '\'' => {
                         result.id = Token.Id.CharLiteral;
@@ -1055,6 +1086,9 @@ pub const Tokenizer = struct {
                 State.CharLiteral,
                 State.CharLiteralBackslash,
                 State.CharLiteralHexEscape,
+                State.CharLiteralUnicodeEscapeSawU,
+                State.CharLiteralUnicodeEscape,
+                State.CharLiteralUnicodeInvalid,
                 State.CharLiteralEnd,
                 State.StringLiteralBackslash,
                 State.LBracketStar,
@@ -1208,7 +1242,60 @@ test "tokenizer - unknown length pointer and then c pointer" {
 test "tokenizer - char literal with hex escape" {
     testTokenize(
         \\'\x1b'
-    , [_]Token.Id{Token.Id.CharLiteral});
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\x1'
+    , [_]Token.Id{ .Invalid, .Invalid });
 }
+
+test "tokenizer - char literal with unicode escapes" {
+    // Valid unicode escapes
+    testTokenize(
+        \\'\u{3}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{01}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{2a}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{3f9}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{6E09aBc1523}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\"\u{440}"
+    , [_]Token.Id{.StringLiteral});
+
+    // Invalid unicode escapes
+    testTokenize(
+        \\'\u'
+    , [_]Token.Id{.Invalid});
+    testTokenize(
+        \\'\u{{'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{s}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{2z}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{4a'
+    , [_]Token.Id{.Invalid});
+
+    // Test old-style unicode literals
+    testTokenize(
+        \\'\u0333'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\U0333'
+    , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
+}
 
 test "tokenizer - float literal e exponent" {
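Reviewer note, not part of the patch to apply: the new behavior is easy to sanity-check from user code. The snippet below is a minimal sketch assuming the std.zig tokenizer API as of this commit (Tokenizer.init/next, Token.Id, and the era's void-returning std.testing.expect that panics on failure); the test name and literals are illustrative only.

const std = @import("std");

test "unicode escape char literals tokenize as expected" {
    // New form: '\u{...}' with at least one hex digit is one CharLiteral token.
    var new_form = std.zig.Tokenizer.init("'\\u{1f4a9}'");
    std.testing.expect(new_form.next().id == std.zig.Token.Id.CharLiteral);

    // Old form: '\U01f4a9' is no longer an escape; tokenizing it now begins
    // with an Invalid token, matching the '\U0333' expectation in the patch.
    var old_form = std.zig.Tokenizer.init("'\\U01f4a9'");
    std.testing.expect(old_form.next().id == std.zig.Token.Id.Invalid);
}

Note that the tokenizer deliberately accepts any number of hex digits between the braces (see the \u{6E09aBc1523} case in the tests), so codepoint range validation has to happen at a later stage; the tokenizer only brackets the token.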