mirror of https://github.com/ziglang/zig.git
[self-hosted] tokenizer error for ascii control codes
This commit is contained in:
parent 45ab752f9a
commit 0082989f22
@@ -141,6 +141,7 @@ pub const Tokenizer = struct {
     buffer: []const u8,
     index: usize,
     actual_file_end: usize,
+    pending_invalid_token: ?Token,

     pub const Location = struct {
         line: usize,
@@ -179,24 +180,18 @@ pub const Tokenizer = struct {
     }

     pub fn init(buffer: []const u8) -> Tokenizer {
-        if (buffer.len == 0 or buffer[buffer.len - 1] == '\n') {
-            return Tokenizer {
-                .buffer = buffer,
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
-        } else {
-            var source_len = buffer.len;
-            while (source_len > 0) : (source_len -= 1) {
-                if (buffer[source_len - 1] == '\n') break;
-            }
-            return Tokenizer {
-                .buffer = buffer[0..source_len],
-                .index = 0,
-                .actual_file_end = buffer.len,
-            };
-        }
+        // last line is incomplete, so skip it, and give an error when we get there.
+        var source_len = buffer.len;
+        while (source_len > 0) : (source_len -= 1) {
+            if (buffer[source_len - 1] == '\n') break;
+        }
+        return Tokenizer {
+            .buffer = buffer[0..source_len],
+            .index = 0,
+            .actual_file_end = buffer.len,
+            .pending_invalid_token = null,
+        };
     }

     const State = enum {
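The two branches of init collapse into one: a buffer that is empty or already ends in '\n' survives the backwards scan untouched (source_len stops at the final newline immediately), while a trailing unterminated line is sliced off, with actual_file_end remembering the true length so the missing newline can be reported later. A minimal sketch of the same trimming logic in isolation, in the Zig dialect of this commit (the helper name trimToLastNewline is hypothetical, not part of the change):

    // Hypothetical helper mirroring the scan in Tokenizer.init: returns the
    // prefix of `buffer` ending at the last '\n', or an empty slice if the
    // buffer contains no newline at all.
    fn trimToLastNewline(buffer: []const u8) -> []const u8 {
        var source_len = buffer.len;
        while (source_len > 0) : (source_len -= 1) {
            if (buffer[source_len - 1] == '\n') break;
        }
        return buffer[0..source_len];
    }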
@@ -223,6 +218,10 @@ pub const Tokenizer = struct {
     };

     pub fn next(self: &Tokenizer) -> Token {
+        if (self.pending_invalid_token) |token| {
+            self.pending_invalid_token = null;
+            return token;
+        }
         var state = State.Start;
         var result = Token {
             .id = Token.Id.Eof,
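A control code is discovered while the tokenizer is still in the middle of a string literal or comment, so the error token cannot be emitted on the spot; it is parked in pending_invalid_token and handed out at the top of the following next() call. A sketch of what a caller observes, assuming the Token.Id names used elsewhere in this diff:

    // Sketch: a NUL byte inside a string literal. The literal's own token
    // is returned first; the parked Invalid token surfaces on the next call.
    var tokenizer = Tokenizer.init("\"\x00\"\n");
    const first = tokenizer.next();  // Token.Id.StringLiteral
    const second = tokenizer.next(); // Token.Id.Invalid, spanning just the NUL
    const third = tokenizer.next();  // Token.Id.Eof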
@@ -368,7 +367,7 @@ pub const Tokenizer = struct {
                     break;
                 },
                 '\n' => break, // Look for this error later.
-                else => {},
+                else => self.checkLiteralCharacter(),
             },

             State.StringLiteralBackslash => switch (c) {
@@ -455,7 +454,7 @@ pub const Tokenizer = struct {
                         .end = undefined,
                     };
                 },
-                else => {},
+                else => self.checkLiteralCharacter(),
             },
             State.Zero => switch (c) {
                 'b', 'o', 'x' => {
@@ -513,10 +512,16 @@ pub const Tokenizer = struct {
             }
         }
         result.end = self.index;
-        if (result.id == Token.Id.Eof and self.actual_file_end != self.buffer.len) {
-            // instead of an Eof, give an error token
-            result.id = Token.Id.NoEolAtEof;
-            result.end = self.actual_file_end;
+        if (result.id == Token.Id.Eof) {
+            if (self.pending_invalid_token) |token| {
+                self.pending_invalid_token = null;
+                return token;
+            }
+            if (self.actual_file_end != self.buffer.len) {
+                // instead of an Eof, give an error token
+                result.id = Token.Id.NoEolAtEof;
+                result.end = self.actual_file_end;
+            }
         }
         return result;
     }
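At end of input the pending invalid token is flushed before the missing-newline check, so a control code found on the last scanned line still wins over NoEolAtEof. A sketch of the NoEolAtEof path itself, under the same assumptions as the sketches above:

    // Sketch: a source with no trailing newline. init trims the incomplete
    // line, and the Eof that would follow the last real token is rewritten
    // into a NoEolAtEof error whose end is the true end of the file.
    var tokenizer = Tokenizer.init("test\nno newline");
    const kw = tokenizer.next();  // Token.Id.Keyword_test
    const err = tokenizer.next(); // Token.Id.NoEolAtEof, err.end == 15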
@@ -524,12 +529,29 @@ pub const Tokenizer = struct {
     pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
         return self.buffer[token.start..token.end];
     }
+
+    fn checkLiteralCharacter(self: &Tokenizer) {
+        if (self.pending_invalid_token != null) return;
+        const c0 = self.buffer[self.index];
+        if (c0 < 0x20 or c0 == 0x7f) {
+            // ascii control codes are never allowed
+            // (note that \n was checked before we got here)
+            self.pending_invalid_token = Token {
+                .id = Token.Id.Invalid,
+                .start = self.index,
+                .end = self.index + 1,
+            };
+            return;
+        }
+    }
 };


 test "tokenizer" {
     // source must end with eol
     testTokenize("", []Token.Id {
     }, true);
     testTokenize("no newline", []Token.Id {
     }, false);
     testTokenize("test\n", []Token.Id {
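The rejected set is the C0 control range (0x00 through 0x1f) plus DEL (0x7f); '\n' falls inside that range but never reaches this function, because the switch arms above consume it first. The predicate on its own, as a hypothetical helper (isAsciiControl is not a name in this commit):

    // Hypothetical helper equivalent to the range check in checkLiteralCharacter.
    fn isAsciiControl(c: u8) -> bool {
        return c < 0x20 or c == 0x7f;
    }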
@@ -538,6 +560,29 @@ test "tokenizer" {
     testTokenize("test\nno newline", []Token.Id {
         Token.Id.Keyword_test,
     }, false);
+
+    // invalid token characters
+    testTokenize("#\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("`\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+
+    // invalid literal/comment characters
+    testTokenize("\"\x00\"\n", []Token.Id {
+        Token.Id { .StringLiteral = Token.StrLitKind.Normal },
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x00\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x1f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
+    testTokenize("//\x7f\n", []Token.Id {
+        Token.Id.Invalid,
+    }, true);
 }

 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
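The comment cases probe the boundaries of the rejected range: 0x00 is the lowest C0 code, 0x1f the highest, and 0x7f is DEL. A boundary sketch that is not in the commit, assuming (as the tests above imply) that a clean comment produces no tokens at all: a space, 0x20, is the first accepted character, so

    testTokenize("// \n", []Token.Id {
    }, true);

should pass without an Invalid token.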
@@ -546,8 +591,8 @@ fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_
         const token = tokenizer.next();
         std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
         switch (expected_token_id) {
-            Token.Id.StringLiteral => |kind| {
-                @panic("TODO: how do i test this?");
+            Token.Id.StringLiteral => |expected_kind| {
+                std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });
             },
             else => {},
         }
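The @TagType cast above already compares tags; the replacement arm additionally compares the StringLiteral payload, using an inline switch to extract the payload from token.id once the tag is known to match. The same idiom spelled out as a fragment (names taken from the test, shown only for illustration):

    // Extracting a union payload when the tag has already been checked:
    const actual_kind = switch (token.id) {
        Token.Id.StringLiteral => |kind| kind,
        else => unreachable,
    };
    std.debug.assert(actual_kind == expected_kind);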