stage2: tokenizer: require null terminated source

By requiring the source file to be null-terminated, we avoid a bounds
check on every character and simplify the logic: end-of-input is handled
as just another character (the 0 sentinel) inside the tokenizer's state
machine.

Running ast-check on a large zig source file (udivmodti4_test.zig),
this commit compared to master:
 * 4% faster wall clock
 * 7% fewer cache misses
 * 1% fewer branches
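
To make the mechanism concrete, here is a minimal sketch of scanning a
sentinel-terminated slice (an illustrative helper, not code from this
commit). Because the parameter is [:0]const u8, reading buffer[buffer.len]
is defined and yields the 0 sentinel, so the hot loop carries no
`index < buffer.len` check:

    // Illustrative only: the sentinel-scanning shape the tokenizer now uses.
    fn countNonWhitespace(buffer: [:0]const u8) usize {
        var count: usize = 0;
        var index: usize = 0;
        while (true) : (index += 1) {
            switch (buffer[index]) {
                0 => return count, // the sentinel marks end of input
                ' ', '\n', '\t', '\r' => {}, // skip whitespace
                else => count += 1,
            }
        }
    }

String literals coerce to [:0]const u8, so countNonWhitespace("a b c")
returns 3; end-of-input becomes just another switch case instead of a
separate loop condition.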
commit 3f680abbe2
parent 8ce880ca75
Author: Andrew Kelley
Date:   2021-06-30 18:03:54 -07:00

6 changed files with 56 additions and 181 deletions


@@ -20,7 +20,7 @@ pub const NodeList = std.MultiArrayList(Node);
 pub const Tree = struct {
     /// Reference to externally-owned data.
-    source: []const u8,
+    source: [:0]const u8,
     tokens: TokenList.Slice,
     /// The root AST node is assumed to be index 0. Since there can be no


@@ -17,7 +17,7 @@ pub const Error = error{ParseError} || Allocator.Error;
 /// Result should be freed with tree.deinit() when there are
 /// no more references to any of the tokens or nodes.
-pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!Tree {
+pub fn parse(gpa: *Allocator, source: [:0]const u8) Allocator.Error!Tree {
     var tokens = ast.TokenList{};
     defer tokens.deinit(gpa);


@@ -5194,7 +5194,7 @@ const maxInt = std.math.maxInt;
 var fixed_buffer_mem: [100 * 1024]u8 = undefined;
-fn testParse(source: []const u8, allocator: *mem.Allocator, anything_changed: *bool) ![]u8 {
+fn testParse(source: [:0]const u8, allocator: *mem.Allocator, anything_changed: *bool) ![]u8 {
     const stderr = io.getStdErr().writer();
     var tree = try std.zig.parse(allocator, source);
@@ -5222,7 +5222,7 @@ fn testParse(source: []const u8, allocator: *mem.Allocator, anything_changed: *b
     anything_changed.* = !mem.eql(u8, formatted, source);
     return formatted;
 }
-fn testTransform(source: []const u8, expected_source: []const u8) !void {
+fn testTransform(source: [:0]const u8, expected_source: []const u8) !void {
     const needed_alloc_count = x: {
         // Try it once with unlimited memory, make sure it works
         var fixed_allocator = std.heap.FixedBufferAllocator.init(fixed_buffer_mem[0..]);
@@ -5268,13 +5268,13 @@ fn testTransform(source: []const u8, expected_source: []const u8) !void {
         }
     }
 }
-fn testCanonical(source: []const u8) !void {
+fn testCanonical(source: [:0]const u8) !void {
     return testTransform(source, source);
 }
 const Error = std.zig.ast.Error.Tag;
-fn testError(source: []const u8, expected_errors: []const Error) !void {
+fn testError(source: [:0]const u8, expected_errors: []const Error) !void {
     var tree = try std.zig.parse(std.testing.allocator, source);
     defer tree.deinit(std.testing.allocator);


@@ -326,7 +326,7 @@ pub const Token = struct {
 };
 pub const Tokenizer = struct {
-    buffer: []const u8,
+    buffer: [:0]const u8,
     index: usize,
     pending_invalid_token: ?Token,
@@ -335,7 +335,7 @@ pub const Tokenizer = struct {
         std.debug.warn("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.start..token.end] });
     }
-    pub fn init(buffer: []const u8) Tokenizer {
+    pub fn init(buffer: [:0]const u8) Tokenizer {
         // Skip the UTF-8 BOM if present
         const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
         return Tokenizer{
@@ -373,7 +373,6 @@ pub const Tokenizer = struct {
         line_comment,
         doc_comment_start,
         doc_comment,
-        container_doc_comment,
         zero,
         int_literal_dec,
         int_literal_dec_no_underscore,
@@ -407,10 +406,6 @@ pub const Tokenizer = struct {
         saw_at_sign,
     };
-    fn isIdentifierChar(char: u8) bool {
-        return std.ascii.isAlNum(char) or char == '_';
-    }
-
     pub fn next(self: *Tokenizer) Token {
         if (self.pending_invalid_token) |token| {
             self.pending_invalid_token = null;
@@ -426,10 +421,11 @@ pub const Tokenizer = struct {
         };
         var seen_escape_digits: usize = undefined;
         var remaining_code_units: usize = undefined;
-        while (self.index < self.buffer.len) : (self.index += 1) {
+        while (true) : (self.index += 1) {
             const c = self.buffer[self.index];
             switch (state) {
                 .start => switch (c) {
+                    0 => break,
                     ' ', '\n', '\t', '\r' => {
                         result.loc.start = self.index + 1;
                     },
@@ -705,18 +701,22 @@ pub const Tokenizer = struct {
                         self.index += 1;
                         break;
                     },
-                    '\n', '\r' => break, // Look for this error later.
+                    0, '\n', '\r' => break, // Look for this error later.
                     else => self.checkLiteralCharacter(),
                 },
                 .string_literal_backslash => switch (c) {
-                    '\n', '\r' => break, // Look for this error later.
+                    0, '\n', '\r' => break, // Look for this error later.
                     else => {
                         state = .string_literal;
                     },
                 },
                 .char_literal => switch (c) {
+                    0 => {
+                        result.tag = .invalid;
+                        break;
+                    },
                     '\\' => {
                         state = .char_literal_backslash;
                     },
@@ -742,7 +742,7 @@ pub const Tokenizer = struct {
                 },
                 .char_literal_backslash => switch (c) {
-                    '\n' => {
+                    0, '\n' => {
                         result.tag = .invalid;
                         break;
                     },
@@ -834,6 +834,7 @@ pub const Tokenizer = struct {
                 },
                 .multiline_string_literal_line => switch (c) {
+                    0 => break,
                     '\n' => {
                         self.index += 1;
                         break;
@@ -1025,12 +1026,13 @@ pub const Tokenizer = struct {
                     },
                 },
                 .line_comment_start => switch (c) {
+                    0 => break,
                     '/' => {
                         state = .doc_comment_start;
                     },
                     '!' => {
                         result.tag = .container_doc_comment;
-                        state = .container_doc_comment;
+                        state = .doc_comment;
                     },
                     '\n' => {
                         state = .start;
@@ -1046,7 +1048,7 @@ pub const Tokenizer = struct {
                     '/' => {
                         state = .line_comment;
                     },
-                    '\n' => {
+                    0, '\n' => {
                         result.tag = .doc_comment;
                         break;
                     },
@@ -1061,6 +1063,7 @@ pub const Tokenizer = struct {
                     },
                 },
                 .line_comment => switch (c) {
+                    0 => break,
                     '\n' => {
                         state = .start;
                         result.loc.start = self.index + 1;
@@ -1068,8 +1071,8 @@ pub const Tokenizer = struct {
                     '\t', '\r' => {},
                     else => self.checkLiteralCharacter(),
                 },
-                .doc_comment, .container_doc_comment => switch (c) {
-                    '\n' => break,
+                .doc_comment => switch (c) {
+                    0, '\n' => break,
                     '\t', '\r' => {},
                     else => self.checkLiteralCharacter(),
                 },
@@ -1088,12 +1091,11 @@ pub const Tokenizer = struct {
                         self.index -= 1;
                         state = .int_literal_dec;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'a', 'c', 'd', 'f'...'n', 'p'...'w', 'y', 'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .int_literal_bin_no_underscore => switch (c) {
                     '0'...'1' => {
@@ -1109,12 +1111,11 @@ pub const Tokenizer = struct {
                         state = .int_literal_bin_no_underscore;
                     },
                     '0'...'1' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    '2'...'9', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .int_literal_oct_no_underscore => switch (c) {
                     '0'...'7' => {
@@ -1130,12 +1131,11 @@ pub const Tokenizer = struct {
                         state = .int_literal_oct_no_underscore;
                     },
                     '0'...'7' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    '8', '9', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .int_literal_dec_no_underscore => switch (c) {
                     '0'...'9' => {
@@ -1159,12 +1159,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .int_literal_hex_no_underscore => switch (c) {
                     '0'...'9', 'a'...'f', 'A'...'F' => {
@@ -1188,12 +1187,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                     },
                     '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .num_dot_dec => switch (c) {
                     '.' => {
@@ -1206,12 +1204,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                         state = .float_fraction_dec;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    '_', 'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .num_dot_hex => switch (c) {
                     '.' => {
@@ -1224,12 +1221,11 @@ pub const Tokenizer = struct {
                         result.tag = .float_literal;
                         state = .float_fraction_hex;
                     },
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    '_', 'g'...'z', 'G'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .float_fraction_dec_no_underscore => switch (c) {
                     '0'...'9' => {
@@ -1248,12 +1244,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_unsigned;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .float_fraction_hex_no_underscore => switch (c) {
                     '0'...'9', 'a'...'f', 'A'...'F' => {
@@ -1272,12 +1267,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_unsigned;
                     },
                     '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
                 .float_exponent_unsigned => switch (c) {
                     '+', '-' => {
@@ -1303,130 +1297,11 @@ pub const Tokenizer = struct {
                         state = .float_exponent_num_no_underscore;
                     },
                     '0'...'9' => {},
-                    else => {
-                        if (isIdentifierChar(c)) {
-                            result.tag = .invalid;
-                        }
+                    'a'...'z', 'A'...'Z' => {
+                        result.tag = .invalid;
                         break;
                     },
+                    else => break,
                 },
             }
-        } else if (self.index == self.buffer.len) {
-            switch (state) {
-                .start,
-                .int_literal_dec,
-                .int_literal_bin,
-                .int_literal_oct,
-                .int_literal_hex,
-                .num_dot_dec,
-                .num_dot_hex,
-                .float_fraction_dec,
-                .float_fraction_hex,
-                .float_exponent_num,
-                .string_literal, // find this error later
-                .multiline_string_literal_line,
-                .builtin,
-                .line_comment,
-                .line_comment_start,
-                => {},
-                .identifier => {
-                    if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
-                        result.tag = tag;
-                    }
-                },
-                .doc_comment, .doc_comment_start => {
-                    result.tag = .doc_comment;
-                },
-                .container_doc_comment => {
-                    result.tag = .container_doc_comment;
-                },
-                .int_literal_dec_no_underscore,
-                .int_literal_bin_no_underscore,
-                .int_literal_oct_no_underscore,
-                .int_literal_hex_no_underscore,
-                .float_fraction_dec_no_underscore,
-                .float_fraction_hex_no_underscore,
-                .float_exponent_num_no_underscore,
-                .float_exponent_unsigned,
-                .saw_at_sign,
-                .backslash,
-                .char_literal,
-                .char_literal_backslash,
-                .char_literal_hex_escape,
-                .char_literal_unicode_escape_saw_u,
-                .char_literal_unicode_escape,
-                .char_literal_unicode_invalid,
-                .char_literal_end,
-                .char_literal_unicode,
-                .string_literal_backslash,
-                => {
-                    result.tag = .invalid;
-                },
-                .equal => {
-                    result.tag = .equal;
-                },
-                .bang => {
-                    result.tag = .bang;
-                },
-                .minus => {
-                    result.tag = .minus;
-                },
-                .slash => {
-                    result.tag = .slash;
-                },
-                .zero => {
-                    result.tag = .integer_literal;
-                },
-                .ampersand => {
-                    result.tag = .ampersand;
-                },
-                .period => {
-                    result.tag = .period;
-                },
-                .period_2 => {
-                    result.tag = .ellipsis2;
-                },
-                .period_asterisk => {
-                    result.tag = .period_asterisk;
-                },
-                .pipe => {
-                    result.tag = .pipe;
-                },
-                .angle_bracket_angle_bracket_right => {
-                    result.tag = .angle_bracket_angle_bracket_right;
-                },
-                .angle_bracket_right => {
-                    result.tag = .angle_bracket_right;
-                },
-                .angle_bracket_angle_bracket_left => {
-                    result.tag = .angle_bracket_angle_bracket_left;
-                },
-                .angle_bracket_left => {
-                    result.tag = .angle_bracket_left;
-                },
-                .plus_percent => {
-                    result.tag = .plus_percent;
-                },
-                .plus => {
-                    result.tag = .plus;
-                },
-                .percent => {
-                    result.tag = .percent;
-                },
-                .caret => {
-                    result.tag = .caret;
-                },
-                .asterisk_percent => {
-                    result.tag = .asterisk_percent;
-                },
-                .asterisk => {
-                    result.tag = .asterisk;
-                },
-                .minus_percent => {
-                    result.tag = .minus_percent;
-                },
-            }
         }


@@ -1541,7 +1541,7 @@ pub fn create(gpa: *Allocator, options: InitOptions) !*Compilation {
         }
     }
-    if (comp.bin_file.options.use_stage1) {
+    if (comp.bin_file.options.use_stage1 and comp.bin_file.options.module != null) {
        try comp.work_queue.writeItem(.{ .stage1_module = {} });
     }


@@ -754,7 +754,7 @@ pub fn render(gpa: *Allocator, nodes: []const Node) !std.zig.ast.Tree {
     });
     return std.zig.ast.Tree{
-        .source = ctx.buf.toOwnedSlice(),
+        .source = try ctx.buf.toOwnedSliceSentinel(0),
         .tokens = ctx.tokens.toOwnedSlice(),
         .nodes = ctx.nodes.toOwnedSlice(),
         .extra_data = ctx.extra_data.toOwnedSlice(gpa),
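
Callers must now hand the parser a sentinel-terminated buffer. Below is a
hedged sketch of the caller side, using the same toOwnedSliceSentinel call
as the hunk above (the helper name and the gpa/raw_source parameters are
illustrative, not part of the commit):

    const std = @import("std");

    // Illustrative: copy arbitrary bytes into a null-terminated buffer that
    // satisfies the new std.zig.parse(gpa, source: [:0]const u8) signature.
    fn dupeWithSentinel(gpa: *std.mem.Allocator, raw_source: []const u8) ![:0]u8 {
        var buf = std.ArrayList(u8).init(gpa);
        errdefer buf.deinit();
        try buf.appendSlice(raw_source);
        // toOwnedSliceSentinel appends the 0 byte and transfers ownership.
        return buf.toOwnedSliceSentinel(0);
    }

For a plain copy, gpa.dupeZ(u8, raw_source) achieves the same in one call;
the ArrayList route matches how render() builds its buffer incrementally.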