Merge pull request #20885 from ziglang/simplify-tokenizer
std.zig.tokenizer: simplification and spec conformance
This commit is contained in: commit eb1a199dff
@@ -677,10 +677,10 @@ fn montReduce(x: i32) i16 {
    // Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R.
    // First we compute
    //
    // m := ((x mod R) q') mod R
    //    = x q' mod R
    //    = int16(x q')
    //    = int16(int32(x) * int32(q'))
    //
    // Note that x q' might be as big as 2³² and could overflow the int32
    // multiplication in the last line. However for any int32s a and b,
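The comment block above is the standard Montgomery-reduction argument. A minimal Zig sketch of the computation it describes, assuming the constants named in the comment (q = 3329, R = 2¹⁶, q' = 62209); an illustration only, not the verbatim std.crypto implementation:

    const q: i32 = 3329;
    const q_inv: i32 = 62209; // q' = q⁻¹ mod 2¹⁶

    fn montReduceSketch(x: i32) i16 {
        // m := int16(x q'), i.e. the low 16 bits of x·q', computed with wraparound.
        const m: i16 = @truncate(x *% q_inv);
        // x - m·q is divisible by R = 2¹⁶, so the shift performs an exact division.
        const yr = x - @as(i32, m) * q;
        // Fits in an i16 provided |x| is bounded as in the surrounding code.
        return @intCast(yr >> 16);
    }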
@@ -203,8 +203,7 @@ pub const symtab_command = extern struct {
    /// local symbols (static and debugging symbols) - grouped by module
    /// defined external symbols - grouped by module (sorted by name if not lib)
    /// undefined external symbols (sorted by name if MH_BINDATLOAD is not set,
-   /// and in order the were seen by the static
-   /// linker if MH_BINDATLOAD is set)
+   /// and in order the were seen by the static linker if MH_BINDATLOAD is set)
    /// In this load command there are offsets and counts to each of the three groups
    /// of symbols.
    ///
@@ -219,9 +218,9 @@ pub const symtab_command = extern struct {
    /// shared library. For executable and object modules, which are files
    /// containing only one module, the information that would be in these three
    /// tables is determined as follows:
-   /// table of contents - the defined external symbols are sorted by name
-   /// module table - the file contains only one module so everything in the
-   /// file is part of the module.
+   /// table of contents - the defined external symbols are sorted by name
+   /// module table - the file contains only one module so everything in the file
+   /// is part of the module.
    /// reference symbol table - is the defined and undefined external symbols
    ///
    /// For dynamically linked shared library files this load command also contains
@@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [

const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

- /// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
- /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
- /// If you already know the length at comptime, you can call one of
- /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
+ /// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
    return switch (bytes.len) {
-        1 => @as(u21, bytes[0]),
-        2 => utf8Decode2(bytes),
-        3 => utf8Decode3(bytes),
-        4 => utf8Decode4(bytes),
+        1 => bytes[0],
+        2 => utf8Decode2(bytes[0..2].*),
+        3 => utf8Decode3(bytes[0..3].*),
+        4 => utf8Decode4(bytes[0..4].*),
        else => unreachable,
    };
}
@@ -113,8 +110,7 @@ const Utf8Decode2Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};
- pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
-     assert(bytes.len == 2);
+ pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
    assert(bytes[0] & 0b11100000 == 0b11000000);
    var value: u21 = bytes[0] & 0b00011111;
@@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
    Utf8EncodesSurrogateHalf,
};
- pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
+ pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
    const value = try utf8Decode3AllowSurrogateHalf(bytes);

    if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
@@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};
- pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {
-     assert(bytes.len == 3);
+ pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
    assert(bytes[0] & 0b11110000 == 0b11100000);
    var value: u21 = bytes[0] & 0b00001111;
@@ -165,8 +160,7 @@ const Utf8Decode4Error = error{
    Utf8OverlongEncoding,
    Utf8CodepointTooLarge,
};
- pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
-     assert(bytes.len == 4);
+ pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
    assert(bytes[0] & 0b11111000 == 0b11110000);
    var value: u21 = bytes[0] & 0b00000111;
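The fixed-length-array signatures above let callers prove the byte count at the type level instead of asserting it at runtime. A short sketch of the new call style (hypothetical test, not part of this diff):

    const std = @import("std");

    test "decode a two-byte UTF-8 sequence" {
        const bytes = "é"; // U+00E9, encoded as 0xC3 0xA9
        const cp = try std.unicode.utf8Decode2(bytes[0..2].*);
        try std.testing.expectEqual(@as(u21, 0xE9), cp);
    }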
@@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {

const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;

+ /// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
    return switch (bytes.len) {
-        1 => @as(u21, bytes[0]),
-        2 => utf8Decode2(bytes),
-        3 => utf8Decode3AllowSurrogateHalf(bytes),
-        4 => utf8Decode4(bytes),
+        1 => bytes[0],
+        2 => utf8Decode2(bytes[0..2].*),
+        3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
+        4 => utf8Decode4(bytes[0..4].*),
        else => unreachable,
    };
}
@@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
        const token = tokenizer.next();
        try tokens.append(gpa, .{
            .tag = token.tag,
-            .start = @as(u32, @intCast(token.loc.start)),
+            .start = @intCast(token.loc.start),
        });
        if (token.tag == .eof) break;
    }
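The @as wrapper is redundant here because the destination field already fixes the result type, so @intCast infers it. A tiny illustration (not from this diff):

    test "@intCast infers its result type from the destination" {
        const loc_start: usize = 123;
        const start: u32 = @intCast(loc_start); // no @as(u32, ...) wrapper needed
        try @import("std").testing.expectEqual(@as(u32, 123), start);
    }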
@@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
                .{raw_string[bad_index]},
            );
        },
+        .empty_char_literal => {
+            return astgen.failOff(token, offset, "empty character literal", .{});
+        },
    }
}
@@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void {
    var msg: std.ArrayListUnmanaged(u8) = .{};
    defer msg.deinit(gpa);

-    const token_starts = tree.tokens.items(.start);
-    const token_tags = tree.tokens.items(.tag);
-
    var notes: std.ArrayListUnmanaged(u32) = .{};
    defer notes.deinit(gpa);

-    const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
-    if (token_tags[tok] == .invalid) {
-        const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
-        const byte_abs = token_starts[tok] + bad_off;
-        try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
-            std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
-        }));
-    }
-
    for (tree.errors[1..]) |note| {
        if (!note.is_note) break;
@@ -6061,7 +6061,6 @@ test "recovery: invalid container members" {
    , &[_]Error{
        .expected_expr,
        .expected_comma_after_field,
-        .expected_type_expr,
        .expected_semi_after_stmt,
    });
}
@@ -1,6 +1,5 @@
const std = @import("../std.zig");
const assert = std.debug.assert;
- const utf8Decode = std.unicode.utf8Decode;
const utf8Encode = std.unicode.utf8Encode;

pub const ParseError = error{
@@ -37,12 +36,16 @@ pub const Error = union(enum) {
    expected_single_quote: usize,
    /// The character at this index cannot be represented without an escape sequence.
    invalid_character: usize,
+    /// `''`. Not returned for string literals.
+    empty_char_literal,
};

- /// Only validates escape sequence characters.
- /// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
+ /// Asserts the slice starts and ends with single-quotes.
+ /// Returns an error if there is not exactly one UTF-8 codepoint in between.
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
-    assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
+    if (slice.len < 3) return .{ .failure = .empty_char_literal };
+    assert(slice[0] == '\'');
+    assert(slice[slice.len - 1] == '\'');

    switch (slice[1]) {
        '\\' => {
@@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
        },
        0 => return .{ .failure = .{ .invalid_character = 1 } },
        else => {
-            const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
+            const inner = slice[1 .. slice.len - 1];
+            const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{
+                .failure = .{ .invalid_unicode_codepoint = 1 },
+            };
+            if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } };
+            const codepoint = switch (n) {
+                1 => inner[0],
+                2 => std.unicode.utf8Decode2(inner[0..2].*),
+                3 => std.unicode.utf8Decode3(inner[0..3].*),
+                4 => std.unicode.utf8Decode4(inner[0..4].*),
+                else => unreachable,
+            } catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } };
            return .{ .success = codepoint };
        },
    }
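With the rewritten branch above, malformed input is reported as a failure value instead of hitting the old catch unreachable. A hypothetical caller (not part of this diff):

    const std = @import("std");

    test "parseCharLiteral reports failures instead of asserting" {
        const ok = std.zig.string_literal.parseCharLiteral("'a'");
        try std.testing.expectEqual(@as(u21, 'a'), ok.success);

        const bad = std.zig.string_literal.parseCharLiteral("'ab'");
        try std.testing.expect(bad == .failure);
    }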
@@ -303,16 +303,16 @@ test "detect" {
        \\<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
        \\<plist version="1.0">
        \\<dict>
        \\ <key>ProductBuildVersion</key>
        \\ <string>7W98</string>
        \\ <key>ProductCopyright</key>
        \\ <string>Apple Computer, Inc. 1983-2004</string>
        \\ <key>ProductName</key>
        \\ <string>Mac OS X</string>
        \\ <key>ProductUserVisibleVersion</key>
        \\ <string>10.3.9</string>
        \\ <key>ProductVersion</key>
        \\ <string>10.3.9</string>
        \\</dict>
        \\</plist>
    ,
@@ -323,18 +323,18 @@ test "detect" {
        \\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
        \\<plist version="1.0">
        \\<dict>
        \\ <key>ProductBuildVersion</key>
        \\ <string>19G68</string>
        \\ <key>ProductCopyright</key>
        \\ <string>1983-2020 Apple Inc.</string>
        \\ <key>ProductName</key>
        \\ <string>Mac OS X</string>
        \\ <key>ProductUserVisibleVersion</key>
        \\ <string>10.15.6</string>
        \\ <key>ProductVersion</key>
        \\ <string>10.15.6</string>
        \\ <key>iOSSupportVersion</key>
        \\ <string>13.6</string>
        \\</dict>
        \\</plist>
    ,
@@ -345,18 +345,18 @@ test "detect" {
        \\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
        \\<plist version="1.0">
        \\<dict>
        \\ <key>ProductBuildVersion</key>
        \\ <string>20A2408</string>
        \\ <key>ProductCopyright</key>
        \\ <string>1983-2020 Apple Inc.</string>
        \\ <key>ProductName</key>
        \\ <string>macOS</string>
        \\ <key>ProductUserVisibleVersion</key>
        \\ <string>11.0</string>
        \\ <key>ProductVersion</key>
        \\ <string>11.0</string>
        \\ <key>iOSSupportVersion</key>
        \\ <string>14.2</string>
        \\</dict>
        \\</plist>
    ,
@@ -367,18 +367,18 @@ test "detect" {
        \\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
        \\<plist version="1.0">
        \\<dict>
        \\ <key>ProductBuildVersion</key>
        \\ <string>20C63</string>
        \\ <key>ProductCopyright</key>
        \\ <string>1983-2020 Apple Inc.</string>
        \\ <key>ProductName</key>
        \\ <string>macOS</string>
        \\ <key>ProductUserVisibleVersion</key>
        \\ <string>11.1</string>
        \\ <key>ProductVersion</key>
        \\ <string>11.1</string>
        \\ <key>iOSSupportVersion</key>
        \\ <string>14.3</string>
        \\</dict>
        \\</plist>
    ,
@@ -109,12 +109,12 @@ const RiscvCpuinfoParser = CpuinfoParser(RiscvCpuinfoImpl);

test "cpuinfo: RISC-V" {
    try testParser(RiscvCpuinfoParser, .riscv64, &Target.riscv.cpu.sifive_u74,
        \\processor : 0
        \\hart : 1
        \\isa : rv64imafdc
        \\mmu : sv39
        \\isa-ext :
        \\uarch : sifive,u74-mc
    );
}
@@ -177,16 +177,16 @@ const PowerpcCpuinfoParser = CpuinfoParser(PowerpcCpuinfoImpl);

test "cpuinfo: PowerPC" {
    try testParser(PowerpcCpuinfoParser, .powerpc, &Target.powerpc.cpu.@"970",
        \\processor : 0
        \\cpu : PPC970MP, altivec supported
        \\clock : 1250.000000MHz
        \\revision : 1.1 (pvr 0044 0101)
    );
    try testParser(PowerpcCpuinfoParser, .powerpc64le, &Target.powerpc.cpu.pwr8,
        \\processor : 0
        \\cpu : POWER8 (raw), altivec supported
        \\clock : 2926.000000MHz
        \\revision : 2.0 (pvr 004d 0200)
    );
}
@@ -304,25 +304,25 @@ test "cpuinfo: ARM" {
        \\CPU revision : 7
    );
    try testParser(ArmCpuinfoParser, .arm, &Target.arm.cpu.cortex_a7,
        \\processor : 0
        \\model name : ARMv7 Processor rev 3 (v7l)
        \\BogoMIPS : 18.00
        \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae
        \\CPU implementer : 0x41
        \\CPU architecture: 7
        \\CPU variant : 0x0
        \\CPU part : 0xc07
        \\CPU revision : 3
        \\
        \\processor : 4
        \\model name : ARMv7 Processor rev 3 (v7l)
        \\BogoMIPS : 90.00
        \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae
        \\CPU implementer : 0x41
        \\CPU architecture: 7
        \\CPU variant : 0x2
        \\CPU part : 0xc0f
        \\CPU revision : 3
    );
    try testParser(ArmCpuinfoParser, .aarch64, &Target.aarch64.cpu.cortex_a72,
        \\processor : 0
@@ -320,7 +320,7 @@ pub const Token = struct {

    pub fn symbol(tag: Tag) []const u8 {
        return tag.lexeme() orelse switch (tag) {
-            .invalid => "invalid bytes",
+            .invalid => "invalid token",
            .identifier => "an identifier",
            .string_literal, .multiline_string_literal_line => "a string literal",
            .char_literal => "a character literal",
@@ -338,22 +338,22 @@ pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

-    /// For debugging purposes
+    /// For debugging purposes.
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
    }

    pub fn init(buffer: [:0]const u8) Tokenizer {
-        // Skip the UTF-8 BOM if present
-        const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
-        return Tokenizer{
+        // Skip the UTF-8 BOM if present.
+        return .{
            .buffer = buffer,
-            .index = src_start,
+            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    const State = enum {
        start,
+        expect_newline,
        identifier,
        builtin,
        string_literal,
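A small illustration of the initializer behavior folded in above (hypothetical test, not part of this diff): with a BOM present, the first token begins at byte offset 3.

    const std = @import("std");

    test "Tokenizer.init skips a UTF-8 BOM" {
        var tokenizer = std.zig.Tokenizer.init("\xEF\xBB\xBFconst");
        const tok = tokenizer.next();
        try std.testing.expectEqual(std.zig.Token.Tag.keyword_const, tok.tag);
        try std.testing.expectEqual(@as(usize, 3), tok.loc.start);
    }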
@@ -361,10 +361,6 @@ pub const Tokenizer = struct {
        multiline_string_literal_line,
        char_literal,
        char_literal_backslash,
-        char_literal_hex_escape,
-        char_literal_unicode_escape_saw_u,
-        char_literal_unicode_escape,
-        char_literal_end,
        backslash,
        equal,
        bang,
@@ -400,30 +396,33 @@ pub const Tokenizer = struct {
        period_2,
        period_asterisk,
        saw_at_sign,
+        invalid,
    };

+    /// After this returns invalid, it will reset on the next newline, returning tokens starting from there.
+    /// An eof token will always be returned at the end.
    pub fn next(self: *Tokenizer) Token {
        var state: State = .start;
-        var result = Token{
-            .tag = .eof,
+        var result: Token = .{
+            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
-        var seen_escape_digits: usize = undefined;
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    0 => {
-                        if (self.index != self.buffer.len) {
-                            result.tag = .invalid;
-                            result.loc.end = self.index;
-                            self.index += 1;
-                            return result;
-                        }
-                        break;
+                        if (self.index == self.buffer.len) return .{
+                            .tag = .eof,
+                            .loc = .{
+                                .start = self.index,
+                                .end = self.index,
+                            },
+                        };
+                        state = .invalid;
                    },
                    ' ', '\n', '\t', '\r' => {
                        result.loc.start = self.index + 1;
|
||||
},
|
||||
'\'' => {
|
||||
state = .char_literal;
|
||||
result.tag = .char_literal;
|
||||
},
|
||||
'a'...'z', 'A'...'Z', '_' => {
|
||||
state = .identifier;
|
||||
@@ -545,14 +545,44 @@ pub const Tokenizer = struct {
                        result.tag = .number_literal;
                    },
                    else => {
-                        result.tag = .invalid;
-                        result.loc.end = self.index;
-                        self.index += std.unicode.utf8ByteSequenceLength(c) catch 1;
-                        return result;
+                        state = .invalid;
                    },
                },

+                .expect_newline => switch (c) {
+                    0 => {
+                        if (self.index == self.buffer.len) {
+                            result.tag = .invalid;
+                            break;
+                        }
+                        state = .invalid;
+                    },
+                    '\n' => {
+                        result.loc.start = self.index + 1;
+                        state = .start;
+                    },
+                    else => {
+                        state = .invalid;
+                    },
+                },

+                .invalid => switch (c) {
+                    0 => if (self.index == self.buffer.len) {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    '\n' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => continue,
+                },
                .saw_at_sign => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    '"' => {
                        result.tag = .identifier;
                        state = .string_literal;
@@ -562,8 +592,7 @@ pub const Tokenizer = struct {
                        result.tag = .builtin;
                    },
                    else => {
-                        result.tag = .invalid;
-                        break;
+                        state = .invalid;
                    },
                },
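The new invalid state above implements the recovery described in the doc comment on next(): an invalid token swallows input up to the next newline, and tokenization resumes after it. A hypothetical demonstration, with token boundaries as this editor reads the new state machine:

    const std = @import("std");

    test "tokenizer resumes after the newline following invalid input" {
        var tokenizer = std.zig.Tokenizer.init("\x01bad\nconst");
        try std.testing.expectEqual(std.zig.Token.Tag.invalid, tokenizer.next().tag);
        try std.testing.expectEqual(std.zig.Token.Tag.keyword_const, tokenizer.next().tag);
        try std.testing.expectEqual(std.zig.Token.Tag.eof, tokenizer.next().tag);
    }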
@@ -698,7 +727,7 @@ pub const Tokenizer = struct {
                },

                .identifier => switch (c) {
-                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
+                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
                    else => {
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
                            result.tag = tag;
@@ -707,26 +736,37 @@ pub const Tokenizer = struct {
                    },
                },
                .builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
                    else => break,
                },
                .backslash => switch (c) {
                    '\\' => {
                        state = .multiline_string_literal_line;
                    },
                    else => {
                    0 => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .multiline_string_literal_line;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => {
                        state = .invalid;
                    },
                },
                .string_literal => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        result.loc.end = self.index;
                    0 => {
                        if (self.index != self.buffer.len) {
                            self.index += 1;
                            state = .invalid;
                            continue;
                        }
                        return result;
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .string_literal_backslash;
@@ -735,150 +775,74 @@ pub const Tokenizer = struct {
                        self.index += 1;
                        break;
                    },
                    else => {
                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },

                .string_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        result.loc.end = self.index;
                        if (self.index != self.buffer.len) {
                            self.index += 1;
                        }
                        return result;
                        break;
                    },
                    else => {
                        state = .string_literal;

                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                },

                .char_literal => switch (c) {
                    0, '\n', '\'' => {
                        result.tag = .invalid;
                        result.loc.end = self.index;
                    0 => {
                        if (self.index != self.buffer.len) {
                            self.index += 1;
                            state = .invalid;
                            continue;
                        }
                        return result;
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .char_literal_backslash;
                    },
                    else => {
                        state = .char_literal_end;

                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                },

                .char_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        result.loc.end = self.index;
                        if (self.index != self.buffer.len) {
                            self.index += 1;
                        }
                        return result;
                    },
                    'x' => {
                        state = .char_literal_hex_escape;
                        seen_escape_digits = 0;
                    },
                    'u' => {
                        state = .char_literal_unicode_escape_saw_u;
                    },
                    else => {
                        state = .char_literal_end;

                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                },

                .char_literal_hex_escape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                        if (seen_escape_digits == 2) {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .char_literal_unicode_escape_saw_u => switch (c) {
                    '{' => {
                        state = .char_literal_unicode_escape;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .char_literal_unicode_escape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    '}' => {
                        state = .char_literal_end; // too many/few digits handled later
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .char_literal_end => switch (c) {
                    '\'' => {
                        result.tag = .char_literal;
                        self.index += 1;
                        break;
                    },
                    else => {
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },

                .char_literal_backslash => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            state = .invalid;
                            continue;
                        }
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => {
                        state = .char_literal;
                    },
                },

                .multiline_string_literal_line => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += 1;
                            return result;
                            state = .invalid;
                            continue;
                        }
                        break;
                    },
@@ -886,17 +850,18 @@ pub const Tokenizer = struct {
                        self.index += 1;
                        break;
                    },
                    '\t' => {},
                    else => {
                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                    '\r' => {
                        if (self.buffer[self.index + 1] == '\n') {
                            self.index += 2;
                            break;
                        } else {
                            state = .invalid;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },

                .bang => switch (c) {
@@ -1113,12 +1078,16 @@ pub const Tokenizer = struct {
                .line_comment_start => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += 1;
                            return result;
                            state = .invalid;
                            continue;
                        }
                        break;
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '/' => {
                        state = .doc_comment_start;
@@ -1127,105 +1096,91 @@ pub const Tokenizer = struct {
                        result.tag = .container_doc_comment;
                        state = .doc_comment;
                    },
                    '\r' => {
                        state = .expect_newline;
                    },
                    '\n' => {
                        state = .start;
                        result.loc.start = self.index + 1;
                    },
                    '\t' => {
                        state = .line_comment;
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => {
                        state = .line_comment;

                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                },
                .doc_comment_start => switch (c) {
                    0, '\n' => {
                        result.tag = .doc_comment;
                        break;
                    },
                    '\r' => {
                        if (self.buffer[self.index + 1] == '\n') {
                            self.index += 1;
                            result.tag = .doc_comment;
                            break;
                        } else {
                            state = .invalid;
                        }
                    },
                    '/' => {
                        state = .line_comment;
                    },
                    0 => {
                        if (self.index != self.buffer.len) {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += 1;
                            return result;
                        }
                        result.tag = .doc_comment;
                        break;
                    },
                    '\n' => {
                        result.tag = .doc_comment;
                        break;
                    },
                    '\t' => {
                        state = .doc_comment;
                        result.tag = .doc_comment;
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => {
                        state = .doc_comment;
                        result.tag = .doc_comment;

                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    },
                },
                .line_comment => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += 1;
                            return result;
                            state = .invalid;
                            continue;
                        }
                        break;
                        return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                    },
                    '\r' => {
                        state = .expect_newline;
                    },
                    '\n' => {
                        state = .start;
                        result.loc.start = self.index + 1;
                    },
                    '\t' => {},
                    else => {
                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },
                .doc_comment => switch (c) {
                    0, '\n' => break,
                    '\t' => {},
                    else => {
                        if (self.invalidCharacterLength()) |len| {
                            result.tag = .invalid;
                            result.loc.end = self.index;
                            self.index += len;
                            return result;
                        }

                        self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
                    0, '\n' => {
                        break;
                    },
                    '\r' => {
                        if (self.buffer[self.index + 1] == '\n') {
                            self.index += 1;
                            break;
                        } else {
                            state = .invalid;
                        }
                    },
                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },
                .int => switch (c) {
                    '.' => state = .int_period,
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
                    'e', 'E', 'p', 'P' => state = .int_exponent,
                    else => break,
                },
@@ -1249,7 +1204,7 @@ pub const Tokenizer = struct {
                    },
                },
                .float => switch (c) {
-                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
+                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
                    'e', 'E', 'p', 'P' => state = .float_exponent,
                    else => break,
                },
@@ -1263,57 +1218,9 @@ pub const Tokenizer = struct {
            }
        }

-        if (result.tag == .eof) {
-            result.loc.start = self.index;
-        }
-
        result.loc.end = self.index;
        return result;
    }

-    fn invalidCharacterLength(self: *Tokenizer) ?u3 {
-        const c0 = self.buffer[self.index];
-        if (std.ascii.isAscii(c0)) {
-            if (c0 == '\r') {
-                if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
-                    // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
-                    // they constitute an illegal byte!
-                    return null;
-                } else {
-                    return 1;
-                }
-            } else if (std.ascii.isControl(c0)) {
-                // ascii control codes are never allowed
-                // (note that \n was checked before we got here)
-                return 1;
-            }
-            // looks fine to me.
-            return null;
-        } else {
-            // check utf8-encoded character.
-            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
-            if (self.index + length > self.buffer.len) {
-                return @as(u3, @intCast(self.buffer.len - self.index));
-            }
-            const bytes = self.buffer[self.index .. self.index + length];
-            switch (length) {
-                2 => {
-                    const value = std.unicode.utf8Decode2(bytes) catch return length;
-                    if (value == 0x85) return length; // U+0085 (NEL)
-                },
-                3 => {
-                    const value = std.unicode.utf8Decode3(bytes) catch return length;
-                    if (value == 0x2028) return length; // U+2028 (LS)
-                    if (value == 0x2029) return length; // U+2029 (PS)
-                },
-                4 => {
-                    _ = std.unicode.utf8Decode4(bytes) catch return length;
-                },
-                else => unreachable,
-            }
-            return null;
-        }
-    }
};

test "keywords" {
@@ -1355,7 +1262,7 @@ test "code point literal with hex escape" {
    , &.{.char_literal});
    try testTokenize(
        \\'\x1'
-    , &.{ .invalid, .invalid });
+    , &.{.char_literal});
}
test "newline in char literal" {

@@ -1396,40 +1303,30 @@ test "code point literal with unicode escapes" {
    // Invalid unicode escapes
    try testTokenize(
        \\'\u'
-    , &.{ .invalid, .invalid });
+    , &.{.char_literal});
    try testTokenize(
        \\'\u{{'
-    , &.{ .invalid, .l_brace, .invalid });
+    , &.{.char_literal});
    try testTokenize(
        \\'\u{}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{s}'
-    , &.{
-        .invalid,
-        .identifier,
-        .r_brace,
-        .invalid,
-    });
+    , &.{.char_literal});
    try testTokenize(
        \\'\u{2z}'
-    , &.{
-        .invalid,
-        .identifier,
-        .r_brace,
-        .invalid,
-    });
+    , &.{.char_literal});
    try testTokenize(
        \\'\u{4a'
-    , &.{ .invalid, .invalid }); // 4a is valid
+    , &.{.char_literal});

    // Test old-style unicode literals
    try testTokenize(
        \\'\u0333'
-    , &.{ .invalid, .number_literal, .invalid });
+    , &.{.char_literal});
    try testTokenize(
        \\'\U0333'
-    , &.{ .invalid, .number_literal, .invalid });
+    , &.{.char_literal});
}
test "code point literal with unicode code point" {

@@ -1465,24 +1362,15 @@ test "invalid token characters" {
    try testTokenize("`", &.{.invalid});
    try testTokenize("'c", &.{.invalid});
    try testTokenize("'", &.{.invalid});
-    try testTokenize("''", &.{.invalid});
+    try testTokenize("''", &.{.char_literal});
    try testTokenize("'\n'", &.{ .invalid, .invalid });
}

test "invalid literal/comment characters" {
-    try testTokenize("\"\x00\"", &.{
-        .invalid,
-        .invalid, // Incomplete string literal starting after invalid
-    });
-    try testTokenize("//\x00", &.{
-        .invalid,
-    });
-    try testTokenize("//\x1f", &.{
-        .invalid,
-    });
-    try testTokenize("//\x7f", &.{
-        .invalid,
-    });
+    try testTokenize("\"\x00\"", &.{.invalid});
+    try testTokenize("//\x00", &.{.invalid});
+    try testTokenize("//\x1f", &.{.invalid});
+    try testTokenize("//\x7f", &.{.invalid});
}
test "utf8" {

@@ -1491,46 +1379,24 @@ test "utf8" {
}

test "invalid utf8" {
-    try testTokenize("//\x80", &.{
-        .invalid,
-    });
-    try testTokenize("//\xbf", &.{
-        .invalid,
-    });
-    try testTokenize("//\xf8", &.{
-        .invalid,
-    });
-    try testTokenize("//\xff", &.{
-        .invalid,
-    });
-    try testTokenize("//\xc2\xc0", &.{
-        .invalid,
-    });
-    try testTokenize("//\xe0", &.{
-        .invalid,
-    });
-    try testTokenize("//\xf0", &.{
-        .invalid,
-    });
-    try testTokenize("//\xf0\x90\x80\xc0", &.{
-        .invalid,
-    });
+    try testTokenize("//\x80", &.{});
+    try testTokenize("//\xbf", &.{});
+    try testTokenize("//\xf8", &.{});
+    try testTokenize("//\xff", &.{});
+    try testTokenize("//\xc2\xc0", &.{});
+    try testTokenize("//\xe0", &.{});
+    try testTokenize("//\xf0", &.{});
+    try testTokenize("//\xf0\x90\x80\xc0", &.{});
}

test "illegal unicode codepoints" {
    // unicode newline characters.U+0085, U+2028, U+2029
    try testTokenize("//\xc2\x84", &.{});
-    try testTokenize("//\xc2\x85", &.{
-        .invalid,
-    });
+    try testTokenize("//\xc2\x85", &.{});
    try testTokenize("//\xc2\x86", &.{});
    try testTokenize("//\xe2\x80\xa7", &.{});
-    try testTokenize("//\xe2\x80\xa8", &.{
-        .invalid,
-    });
-    try testTokenize("//\xe2\x80\xa9", &.{
-        .invalid,
-    });
+    try testTokenize("//\xe2\x80\xa8", &.{});
+    try testTokenize("//\xe2\x80\xa9", &.{});
    try testTokenize("//\xe2\x80\xaa", &.{});
}
@@ -1549,30 +1415,6 @@ test "string identifier and builtin fns" {
    });
}

-test "multiline string literal with literal tab" {
-    try testTokenize(
-        \\\\foo	bar
-    , &.{
-        .multiline_string_literal_line,
-    });
-}
-
-test "comments with literal tab" {
-    try testTokenize(
-        \\//foo	bar
-        \\//!foo	bar
-        \\///foo	bar
-        \\// foo
-        \\/// foo
-        \\/// /foo
-    , &.{
-        .container_doc_comment,
-        .doc_comment,
-        .doc_comment,
-        .doc_comment,
-    });
-}

test "pipe and then invalid" {
    try testTokenize("||=", &.{
        .pipe_pipe,
@@ -1892,8 +1734,8 @@ test "multi line string literal with only 1 backslash" {
}

test "invalid builtin identifiers" {
-    try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
-    try testTokenize("@0()", &.{ .invalid, .number_literal, .l_paren, .r_paren });
+    try testTokenize("@()", &.{.invalid});
+    try testTokenize("@0()", &.{.invalid});
}

test "invalid token with unfinished escape right before eof" {
@@ -1921,21 +1763,78 @@ test "saturating operators" {
}

test "null byte before eof" {
-    try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal });
+    try testTokenize("123 \x00 456", &.{ .number_literal, .invalid });
    try testTokenize("//\x00", &.{.invalid});
    try testTokenize("\\\\\x00", &.{.invalid});
    try testTokenize("\x00", &.{.invalid});
    try testTokenize("// NUL\x00\n", &.{.invalid});
-    try testTokenize("///\x00\n", &.{.invalid});
+    try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
    try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}

+test "invalid tabs and carriage returns" {
+    // "Inside Line Comments and Documentation Comments, Any TAB is rejected by
+    // the grammar since it is ambiguous how it should be rendered."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("//\t", &.{.invalid});
+    try testTokenize("// \t", &.{.invalid});
+    try testTokenize("///\t", &.{.invalid});
+    try testTokenize("/// \t", &.{.invalid});
+    try testTokenize("//!\t", &.{.invalid});
+    try testTokenize("//! \t", &.{.invalid});
+
+    // "Inside Line Comments and Documentation Comments, CR directly preceding
+    // NL is unambiguously part of the newline sequence. It is accepted by the
+    // grammar and removed by zig fmt, leaving only NL. CR anywhere else is
+    // rejected by the grammar."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("//\r", &.{.invalid});
+    try testTokenize("// \r", &.{.invalid});
+    try testTokenize("///\r", &.{.invalid});
+    try testTokenize("/// \r", &.{.invalid});
+    try testTokenize("//\r ", &.{.invalid});
+    try testTokenize("// \r ", &.{.invalid});
+    try testTokenize("///\r ", &.{.invalid});
+    try testTokenize("/// \r ", &.{.invalid});
+    try testTokenize("//\r\n", &.{});
+    try testTokenize("// \r\n", &.{});
+    try testTokenize("///\r\n", &.{.doc_comment});
+    try testTokenize("/// \r\n", &.{.doc_comment});
+    try testTokenize("//!\r", &.{.invalid});
+    try testTokenize("//! \r", &.{.invalid});
+    try testTokenize("//!\r ", &.{.invalid});
+    try testTokenize("//! \r ", &.{.invalid});
+    try testTokenize("//!\r\n", &.{.container_doc_comment});
+    try testTokenize("//! \r\n", &.{.container_doc_comment});
+
+    // The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
+    // except if CR is directly before NL.
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\\\\\r", &.{.invalid});
+    try testTokenize("\\\\\r ", &.{.invalid});
+    try testTokenize("\\\\ \r", &.{.invalid});
+    try testTokenize("\\\\\t", &.{.invalid});
+    try testTokenize("\\\\\t ", &.{.invalid});
+    try testTokenize("\\\\ \t", &.{.invalid});
+    try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
+
+    // "TAB used as whitespace is...accepted by the grammar. CR used as
+    // whitespace, whether directly preceding NL or stray, is...accepted by the
+    // grammar."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
+    try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
+}

fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next();
        try std.testing.expectEqual(expected_token_tag, token.tag);
    }
+    // Last token should always be eof, even when the last token was invalid,
+    // in which case the tokenizer is in an invalid state, which can only be
+    // recovered by opinionated means outside the scope of this implementation.
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
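The invariant this helper asserts, that every stream ends with an eof token whose start is source.len, makes simple scanning loops well-defined. A sketch of such a loop (hypothetical, not part of this diff):

    const std = @import("std");

    fn countTokens(source: [:0]const u8) usize {
        var tokenizer = std.zig.Tokenizer.init(source);
        var n: usize = 0;
        while (true) {
            const token = tokenizer.next();
            if (token.tag == .eof) break;
            n += 1;
        }
        return n;
    }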
@@ -549,6 +549,9 @@ const Parse = struct {
                .{raw_string[bad_index]},
            );
        },
+        .empty_char_literal => {
+            try p.appendErrorOff(token, offset, "empty character literal", .{});
+        },
    }
}
@@ -237,10 +237,7 @@ test "double quoted string" {
    try testing.expectEqualStrings(
        \\"here" are some escaped quotes
    , arr[1]);
-    try testing.expectEqualStrings(
-        \\newlines and tabs
-        \\are	supported
-    , arr[2]);
+    try testing.expectEqualStrings("newlines and tabs\nare\tsupported", arr[2]);
    try testing.expectEqualStrings(
        \\let's have
        \\some fun!
test/cases/compile_errors/empty_char_lit.zig (new file)
@@ -0,0 +1,9 @@
+ export fn entry() u8 {
+     return '';
+ }
+
+ // error
+ // backend=stage2
+ // target=native
+ //
+ // :2:12: error: empty character literal
@@ -6,5 +6,4 @@ export fn entry() void {
// backend=stage2
// target=native
//
- // :2:15: error: expected expression, found 'invalid bytes'
- // :2:18: note: invalid byte: '1'
+ // :2:17: error: invalid escape character: 'U'
@@ -6,6 +6,5 @@ export fn entry() void {
// backend=stage2
// target=native
//
- // :2:15: error: expected expression, found 'invalid bytes'
- // :2:21: note: invalid byte: 'z'
+ // :2:21: error: expected hex digit or '}', found 'z'
@@ -5,5 +5,4 @@ b";
// backend=stage2
// target=native
//
- // :1:13: error: expected expression, found 'invalid bytes'
- // :1:15: note: invalid byte: '\n'
+ // :1:13: error: expected expression, found 'invalid token'
@@ -38,15 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
        });
    }

-    {
-        const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
-
-        case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
-            ":1:13: error: expected expression, found 'invalid bytes'",
-            ":1:19: note: invalid byte: '\\r'",
-        });
-    }
-
    {
        const case = ctx.obj("missing semicolon at EOF", b.graph.host);
        case.addError(
@@ -179,8 +170,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
        \\ return true;
        \\}
    , &[_][]const u8{
-        ":1:1: error: expected type expression, found 'invalid bytes'",
-        ":1:1: note: invalid byte: '\\xff'",
+        ":1:1: error: expected type expression, found 'invalid token'",
    });
}
@@ -222,8 +212,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
    const case = ctx.obj("invalid byte in string", b.graph.host);

    case.addError("_ = \"\x01Q\";", &[_][]const u8{
-        ":1:5: error: expected expression, found 'invalid bytes'",
-        ":1:6: note: invalid byte: '\\x01'",
+        ":1:5: error: expected expression, found 'invalid token'",
    });
}
@@ -231,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
    const case = ctx.obj("invalid byte in comment", b.graph.host);

    case.addError("//\x01Q", &[_][]const u8{
-        ":1:1: error: expected type expression, found 'invalid bytes'",
-        ":1:3: note: invalid byte: '\\x01'",
+        ":1:1: error: expected type expression, found 'invalid token'",
    });
}
@@ -240,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
    const case = ctx.obj("control character in character literal", b.graph.host);

    case.addError("const c = '\x01';", &[_][]const u8{
-        ":1:11: error: expected expression, found 'invalid bytes'",
-        ":1:12: note: invalid byte: '\\x01'",
+        ":1:11: error: expected expression, found 'invalid token'",
    });
}
@@ -249,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
    const case = ctx.obj("invalid byte at start of token", b.graph.host);

    case.addError("x = \x00Q", &[_][]const u8{
-        ":1:5: error: expected expression, found 'invalid bytes'",
-        ":1:5: note: invalid byte: '\\x00'",
+        ":1:5: error: expected expression, found 'invalid token'",
    });
}
}
@@ -26,17 +26,17 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void {
    \\void baz(void);
    \\struct foo { int x; };
    \\void bar() {
    \\ struct foo tmp;
    \\}
    \\
    \\void baz() {
    \\ struct foo tmp;
    \\}
    \\
    \\int main(void) {
    \\ bar();
    \\ baz();
    \\ return 0;
    \\}
, "");
@@ -53,7 +53,7 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void {
cases.add("parenthesized string literal",
    \\void foo(const char *s) {}
    \\int main(void) {
    \\ foo(("bar"));
    \\}
, "");
@@ -133,20 +133,20 @@ pub fn addCases(cases: *tests.TranslateCContext) void {

cases.add("scoped typedef",
    \\void foo() {
    \\ typedef union {
    \\ int A;
    \\ int B;
    \\ int C;
    \\ } Foo;
    \\ Foo a = {0};
    \\ {
    \\ typedef union {
    \\ int A;
    \\ int B;
    \\ int C;
    \\ } Foo;
    \\ Foo a = {0};
    \\ }
    \\}
, &[_][]const u8{
    \\pub export fn foo() void {
@@ -2004,18 +2004,18 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
    \\ break;
    \\ }
    \\ case 4:
    \\ case 5:
    \\ res = 69;
    \\ {
    \\ res = 5;
    \\ return;
    \\ }
    \\ case 6:
    \\ switch (res) {
    \\ case 9: break;
    \\ }
    \\ res = 1;
    \\ return;
    \\ }
    \\}
, &[_][]const u8{