mirror of
https://github.com/ziglang/zig.git
synced 2026-01-02 03:25:01 +00:00
Recognize & skip the UTF-8 BOM
This commit is contained in:
parent
0eddee449d
commit
f36b8fd7b2
@ -407,9 +407,14 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
t.buf = buf;
|
||||
|
||||
out->line_offsets = allocate<ZigList<size_t>>(1);
|
||||
|
||||
out->line_offsets->append(0);
|
||||
for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) {
|
||||
|
||||
// Skip the UTF-8 BOM if present
|
||||
if (buf_starts_with_mem(buf, "\xEF\xBB\xBF", 3)) {
|
||||
t.pos += 3;
|
||||
}
|
||||
|
||||
for (; t.pos < buf_len(t.buf); t.pos += 1) {
|
||||
uint8_t c = buf_ptr(t.buf)[t.pos];
|
||||
switch (t.state) {
|
||||
case TokenizeStateError:
|
||||
|
||||
@ -222,9 +222,11 @@ pub const Tokenizer = struct {
|
||||
},
|
||||
};
|
||||
} else {
|
||||
// Skip the UTF-8 BOM if present
|
||||
const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else usize(0);
|
||||
return Tokenizer{
|
||||
.buffer = buffer,
|
||||
.index = 0,
|
||||
.index = src_start,
|
||||
.pending_invalid_token = null,
|
||||
};
|
||||
}
|
||||
@ -1455,6 +1457,13 @@ test "tokenizer - line comment followed by identifier" {
|
||||
});
|
||||
}
|
||||
|
||||
test "tokenizer - UTF-8 BOM is recognized and skipped" {
|
||||
testTokenize("\xEF\xBB\xBFa;\n", [_]Token.Id{
|
||||
Token.Id.Identifier,
|
||||
Token.Id.Semicolon,
|
||||
});
|
||||
}
|
||||
|
||||
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
|
||||
var tokenizer = Tokenizer.init(source);
|
||||
for (expected_tokens) |expected_token_id| {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user