mirror of https://github.com/ziglang/zig.git (synced 2025-12-09 15:53:08 +00:00)
stage2: tokenizer: require null terminated source
By requiring the source file to be null-terminated, we avoid extra branching
while simplifying the logic at the same time.

Running ast-check on a large zig source file (udivmodti4_test.zig), master
branch compared to this commit:

 * 4% faster wall clock
 * 7% fewer cache misses
 * 1% fewer branches
parent 8ce880ca75
commit 3f680abbe2
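The gist of the change, as a minimal standalone Zig sketch (illustrative only; countIdentChars is a made-up helper, not part of the commit): with a sentinel-terminated slice, the scan loop can drop its `index < buffer.len` bounds check and handle the guaranteed trailing 0 byte as just another switch case, which is exactly the `while (true)` / `0 => break` pattern in the diff below.

const std = @import("std");

// Sketch: scanning a [:0]const u8 needs no separate bounds branch, because
// indexing at source.len is legal and yields the 0 sentinel.
fn countIdentChars(source: [:0]const u8) usize {
    var count: usize = 0;
    var i: usize = 0;
    while (true) : (i += 1) {
        switch (source[i]) {
            0 => break, // the sentinel ends the scan; no `i < source.len` check
            'a'...'z', 'A'...'Z', '_' => count += 1,
            else => {},
        }
    }
    return count;
}

test "sentinel-terminated scan" {
    // String literals are null-terminated and coerce to [:0]const u8.
    try std.testing.expectEqual(@as(usize, 5), countIdentChars("ab c_d!"));
}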
@@ -20,7 +20,7 @@ pub const NodeList = std.MultiArrayList(Node);

 pub const Tree = struct {
     /// Reference to externally-owned data.
-    source: []const u8,
+    source: [:0]const u8,

     tokens: TokenList.Slice,
     /// The root AST node is assumed to be index 0. Since there can be no
@@ -17,7 +17,7 @@ pub const Error = error{ParseError} || Allocator.Error;

 /// Result should be freed with tree.deinit() when there are
 /// no more references to any of the tokens or nodes.
-pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!Tree {
+pub fn parse(gpa: *Allocator, source: [:0]const u8) Allocator.Error!Tree {
     var tokens = ast.TokenList{};
     defer tokens.deinit(gpa);

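Callers now have to supply a null-terminated slice. A hedged sketch of two common ways to produce one (the names below are illustrative, not from this commit):

const std = @import("std");

test "obtaining a [:0]const u8 source" {
    const gpa = std.testing.allocator;

    // String literals are *const [N:0]u8 and coerce to [:0]const u8.
    const literal_src: [:0]const u8 = "const x = 1;";
    try std.testing.expectEqual(@as(u8, 0), literal_src[literal_src.len]);

    // Bytes obtained at runtime can be copied with a trailing sentinel.
    const runtime_bytes: []const u8 = "var y = 2;";
    const owned_src = try gpa.dupeZ(u8, runtime_bytes); // returns [:0]u8
    defer gpa.free(owned_src);
    try std.testing.expectEqual(@as(u8, 0), owned_src[owned_src.len]);
}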
@@ -5194,7 +5194,7 @@ const maxInt = std.math.maxInt;

 var fixed_buffer_mem: [100 * 1024]u8 = undefined;

-fn testParse(source: []const u8, allocator: *mem.Allocator, anything_changed: *bool) ![]u8 {
+fn testParse(source: [:0]const u8, allocator: *mem.Allocator, anything_changed: *bool) ![]u8 {
     const stderr = io.getStdErr().writer();

     var tree = try std.zig.parse(allocator, source);
@@ -5222,7 +5222,7 @@ fn testParse(source: []const u8, allocator: *mem.Allocator, anything_changed: *bool) ![]u8 {
     anything_changed.* = !mem.eql(u8, formatted, source);
     return formatted;
 }
-fn testTransform(source: []const u8, expected_source: []const u8) !void {
+fn testTransform(source: [:0]const u8, expected_source: []const u8) !void {
     const needed_alloc_count = x: {
         // Try it once with unlimited memory, make sure it works
         var fixed_allocator = std.heap.FixedBufferAllocator.init(fixed_buffer_mem[0..]);
@@ -5268,13 +5268,13 @@ fn testTransform(source: []const u8, expected_source: []const u8) !void {
         }
     }
 }
-fn testCanonical(source: []const u8) !void {
+fn testCanonical(source: [:0]const u8) !void {
     return testTransform(source, source);
 }

 const Error = std.zig.ast.Error.Tag;

-fn testError(source: []const u8, expected_errors: []const Error) !void {
+fn testError(source: [:0]const u8, expected_errors: []const Error) !void {
     var tree = try std.zig.parse(std.testing.allocator, source);
     defer tree.deinit(std.testing.allocator);

@@ -326,7 +326,7 @@ pub const Token = struct {
 };

 pub const Tokenizer = struct {
-    buffer: []const u8,
+    buffer: [:0]const u8,
     index: usize,
     pending_invalid_token: ?Token,

@@ -335,7 +335,7 @@ pub const Tokenizer = struct {
         std.debug.warn("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.start..token.end] });
     }

-    pub fn init(buffer: []const u8) Tokenizer {
+    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present
        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
        return Tokenizer{
@@ -373,7 +373,6 @@ pub const Tokenizer = struct {
         line_comment,
         doc_comment_start,
         doc_comment,
-        container_doc_comment,
         zero,
         int_literal_dec,
         int_literal_dec_no_underscore,
@@ -407,10 +406,6 @@ pub const Tokenizer = struct {
         saw_at_sign,
     };

-    fn isIdentifierChar(char: u8) bool {
-        return std.ascii.isAlNum(char) or char == '_';
-    }
-
    pub fn next(self: *Tokenizer) Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
@@ -426,10 +421,11 @@ pub const Tokenizer = struct {
         };
         var seen_escape_digits: usize = undefined;
         var remaining_code_units: usize = undefined;
-        while (self.index < self.buffer.len) : (self.index += 1) {
+        while (true) : (self.index += 1) {
             const c = self.buffer[self.index];
             switch (state) {
                 .start => switch (c) {
+                    0 => break,
                     ' ', '\n', '\t', '\r' => {
                         result.loc.start = self.index + 1;
                     },
@@ -705,18 +701,22 @@ pub const Tokenizer = struct {
                         self.index += 1;
                         break;
                     },
-                    '\n', '\r' => break, // Look for this error later.
+                    0, '\n', '\r' => break, // Look for this error later.
                     else => self.checkLiteralCharacter(),
                 },

                 .string_literal_backslash => switch (c) {
-                    '\n', '\r' => break, // Look for this error later.
+                    0, '\n', '\r' => break, // Look for this error later.
                     else => {
                         state = .string_literal;
                     },
                 },

                 .char_literal => switch (c) {
+                    0 => {
+                        result.tag = .invalid;
+                        break;
+                    },
                     '\\' => {
                         state = .char_literal_backslash;
                     },
@@ -742,7 +742,7 @@ pub const Tokenizer = struct {
                 },

                 .char_literal_backslash => switch (c) {
-                    '\n' => {
+                    0, '\n' => {
                         result.tag = .invalid;
                         break;
                     },
@@ -834,6 +834,7 @@ pub const Tokenizer = struct {
                 },

                 .multiline_string_literal_line => switch (c) {
+                    0 => break,
                     '\n' => {
                         self.index += 1;
                         break;
@@ -1025,12 +1026,13 @@ pub const Tokenizer = struct {
                     },
                 },
                 .line_comment_start => switch (c) {
+                    0 => break,
                     '/' => {
                         state = .doc_comment_start;
                     },
                     '!' => {
                         result.tag = .container_doc_comment;
-                        state = .container_doc_comment;
+                        state = .doc_comment;
                     },
                     '\n' => {
                         state = .start;
@@ -1046,7 +1048,7 @@ pub const Tokenizer = struct {
                     '/' => {
                         state = .line_comment;
                     },
-                    '\n' => {
+                    0, '\n' => {
                         result.tag = .doc_comment;
                         break;
                     },
@@ -1061,6 +1063,7 @@ pub const Tokenizer = struct {
                     },
                 },
                 .line_comment => switch (c) {
+                    0 => break,
                     '\n' => {
                         state = .start;
                         result.loc.start = self.index + 1;
@@ -1068,8 +1071,8 @@ pub const Tokenizer = struct {
                     '\t', '\r' => {},
                     else => self.checkLiteralCharacter(),
                 },
-                .doc_comment, .container_doc_comment => switch (c) {
-                    '\n' => break,
+                .doc_comment => switch (c) {
+                    0, '\n' => break,
                     '\t', '\r' => {},
                     else => self.checkLiteralCharacter(),
                 },
@@ -1088,12 +1091,11 @@ pub const Tokenizer = struct {
                         self.index -= 1;
                         state = .int_literal_dec;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'a', 'c', 'd', 'f'...'n', 'p'...'w', 'y', 'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .int_literal_bin_no_underscore => switch (c) {
                     '0'...'1' => {
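The pattern in this and the following hunks: the removed isIdentifierChar fallback is replaced by explicitly enumerating the alphanumeric characters that earlier cases in the same switch do not already consume, so the final `else` can break immediately. A hedged sanity check of that enumeration for the `.zero` case above (assuming its earlier cases handle the 'b'/'o'/'x' base prefixes, '0'...'9', '_', '.', and 'e'/'E'):

const std = @import("std");

test "explicit ranges equal isIdentifierChar minus already-handled chars" {
    var c: u8 = 0;
    while (true) : (c += 1) {
        const ident = std.ascii.isAlNum(c) or c == '_';
        const handled = switch (c) {
            'b', 'o', 'x', 'e', 'E', '_', '0'...'9' => true,
            else => false,
        };
        const explicit = switch (c) {
            'a', 'c', 'd', 'f'...'n', 'p'...'w', 'y', 'z', 'A'...'D', 'F'...'Z' => true,
            else => false,
        };
        try std.testing.expectEqual(ident and !handled, explicit);
        if (c == 255) break; // avoid u8 overflow on the last iteration
    }
}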
@@ -1109,12 +1111,11 @@ pub const Tokenizer = struct {
                         state = .int_literal_bin_no_underscore;
                     },
                     '0'...'1' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    '2'...'9', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .int_literal_oct_no_underscore => switch (c) {
                     '0'...'7' => {
@@ -1130,12 +1131,11 @@ pub const Tokenizer = struct {
                         state = .int_literal_oct_no_underscore;
                     },
                     '0'...'7' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    '8', '9', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .int_literal_dec_no_underscore => switch (c) {
                     '0'...'9' => {
@@ -1159,12 +1159,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .int_literal_hex_no_underscore => switch (c) {
                     '0'...'9', 'a'...'f', 'A'...'F' => {
@@ -1188,12 +1187,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                     },
                     '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .num_dot_dec => switch (c) {
                     '.' => {
@@ -1206,12 +1204,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                         state = .float_fraction_dec;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    '_', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .num_dot_hex => switch (c) {
                     '.' => {
@@ -1224,12 +1221,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                         state = .float_fraction_hex;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    '_', 'g'...'z', 'G'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .float_fraction_dec_no_underscore => switch (c) {
                     '0'...'9' => {
@@ -1248,12 +1244,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_unsigned;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .float_fraction_hex_no_underscore => switch (c) {
                     '0'...'9', 'a'...'f', 'A'...'F' => {
@@ -1272,12 +1267,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_unsigned;
                     },
                     '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
                 .float_exponent_unsigned => switch (c) {
                     '+', '-' => {
@@ -1303,130 +1297,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_num_no_underscore;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
-                        break;
-                    },
+                    'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
+                        break;
+                    },
+                    else => break,
                 },
             }
-        } else if (self.index == self.buffer.len) {
-            switch (state) {
-                .start,
-                .int_literal_dec,
-                .int_literal_bin,
-                .int_literal_oct,
-                .int_literal_hex,
-                .num_dot_dec,
-                .num_dot_hex,
-                .float_fraction_dec,
-                .float_fraction_hex,
-                .float_exponent_num,
-                .string_literal, // find this error later
-                .multiline_string_literal_line,
-                .builtin,
-                .line_comment,
-                .line_comment_start,
-                => {},
-
-                .identifier => {
-                    if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
-                        result.tag = tag;
-                    }
-                },
-                .doc_comment, .doc_comment_start => {
-                    result.tag = .doc_comment;
-                },
-                .container_doc_comment => {
-                    result.tag = .container_doc_comment;
-                },
-
-                .int_literal_dec_no_underscore,
-                .int_literal_bin_no_underscore,
-                .int_literal_oct_no_underscore,
-                .int_literal_hex_no_underscore,
-                .float_fraction_dec_no_underscore,
-                .float_fraction_hex_no_underscore,
-                .float_exponent_num_no_underscore,
-                .float_exponent_unsigned,
-                .saw_at_sign,
-                .backslash,
-                .char_literal,
-                .char_literal_backslash,
-                .char_literal_hex_escape,
-                .char_literal_unicode_escape_saw_u,
-                .char_literal_unicode_escape,
-                .char_literal_unicode_invalid,
-                .char_literal_end,
-                .char_literal_unicode,
-                .string_literal_backslash,
-                => {
-                    result.tag = .invalid;
-                },
-
-                .equal => {
-                    result.tag = .equal;
-                },
-                .bang => {
-                    result.tag = .bang;
-                },
-                .minus => {
-                    result.tag = .minus;
-                },
-                .slash => {
-                    result.tag = .slash;
-                },
-                .zero => {
-                    result.tag = .integer_literal;
-                },
-                .ampersand => {
-                    result.tag = .ampersand;
-                },
-                .period => {
-                    result.tag = .period;
-                },
-                .period_2 => {
-                    result.tag = .ellipsis2;
-                },
-                .period_asterisk => {
-                    result.tag = .period_asterisk;
-                },
-                .pipe => {
-                    result.tag = .pipe;
-                },
-                .angle_bracket_angle_bracket_right => {
-                    result.tag = .angle_bracket_angle_bracket_right;
-                },
-                .angle_bracket_right => {
-                    result.tag = .angle_bracket_right;
-                },
-                .angle_bracket_angle_bracket_left => {
-                    result.tag = .angle_bracket_angle_bracket_left;
-                },
-                .angle_bracket_left => {
-                    result.tag = .angle_bracket_left;
-                },
-                .plus_percent => {
-                    result.tag = .plus_percent;
-                },
-                .plus => {
-                    result.tag = .plus;
-                },
-                .percent => {
-                    result.tag = .percent;
-                },
-                .caret => {
-                    result.tag = .caret;
-                },
-                .asterisk_percent => {
-                    result.tag = .asterisk_percent;
-                },
-                .asterisk => {
-                    result.tag = .asterisk;
-                },
-                .minus_percent => {
-                    result.tag = .minus_percent;
-                },
-            }
         }
@@ -1541,7 +1541,7 @@ pub fn create(gpa: *Allocator, options: InitOptions) !*Compilation {
         }
     }

-    if (comp.bin_file.options.use_stage1) {
+    if (comp.bin_file.options.use_stage1 and comp.bin_file.options.module != null) {
        try comp.work_queue.writeItem(.{ .stage1_module = {} });
    }

@@ -754,7 +754,7 @@ pub fn render(gpa: *Allocator, nodes: []const Node) !std.zig.ast.Tree {
    });

    return std.zig.ast.Tree{
-        .source = ctx.buf.toOwnedSlice(),
+        .source = try ctx.buf.toOwnedSliceSentinel(0),
        .tokens = ctx.tokens.toOwnedSlice(),
        .nodes = ctx.nodes.toOwnedSlice(),
        .extra_data = ctx.extra_data.toOwnedSlice(gpa),
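The renderer correspondingly hands off its buffer with a trailing 0 so the result types as [:0]u8. A minimal sketch of that hand-off (assuming the managed ArrayList API of this era of the standard library):

const std = @import("std");

test "toOwnedSliceSentinel produces a null-terminated slice" {
    const gpa = std.testing.allocator;
    var buf = std.ArrayList(u8).init(gpa);
    defer buf.deinit(); // safe: the list is empty after the hand-off
    try buf.appendSlice("const x = 42;");

    const source: [:0]u8 = try buf.toOwnedSliceSentinel(0);
    defer gpa.free(source);
    try std.testing.expectEqual(@as(u8, 0), source[source.len]);
}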