diff --git a/lib/std/zig/parse.zig b/lib/std/zig/parse.zig
index 5ec80a8390..3bb27975db 100644
--- a/lib/std/zig/parse.zig
+++ b/lib/std/zig/parse.zig
@@ -50,22 +50,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8) Allocator.Error!Ast {
     const estimated_node_count = (tokens.len + 2) / 2;
     try parser.nodes.ensureTotalCapacity(gpa, estimated_node_count);
 
-    // Root node must be index 0.
-    // Root <- skip ContainerMembers eof
-    parser.nodes.appendAssumeCapacity(.{
-        .tag = .root,
-        .main_token = 0,
-        .data = undefined,
-    });
-    const root_members = try parser.parseContainerMembers();
-    const root_decls = try root_members.toSpan(&parser);
-    if (parser.token_tags[parser.tok_i] != .eof) {
-        try parser.warnExpected(.eof);
-    }
-    parser.nodes.items(.data)[0] = .{
-        .lhs = root_decls.start,
-        .rhs = root_decls.end,
-    };
+    try parser.parseRoot();
 
     // TODO experiment with compacting the MultiArrayList slices here
     return Ast{
@@ -237,12 +222,33 @@ const Parser = struct {
         return error.ParseError;
     }
 
+    /// Root <- skip container_doc_comment? ContainerMembers eof
+    fn parseRoot(p: *Parser) !void {
+        // Root node must be index 0.
+        p.nodes.appendAssumeCapacity(.{
+            .tag = .root,
+            .main_token = 0,
+            .data = undefined,
+        });
+        const root_members = try p.parseContainerMembers();
+        const root_decls = try root_members.toSpan(p);
+        if (p.token_tags[p.tok_i] != .eof) {
+            try p.warnExpected(.eof);
+        }
+        p.nodes.items(.data)[0] = .{
+            .lhs = root_decls.start,
+            .rhs = root_decls.end,
+        };
+    }
+
     /// ContainerMembers <- ContainerDeclarations (ContainerField COMMA)* (ContainerField / ContainerDeclarations)
+    ///
     /// ContainerDeclarations
     ///     <- TestDecl ContainerDeclarations
     ///      / ComptimeDecl ContainerDeclarations
     ///      / doc_comment? KEYWORD_pub? Decl ContainerDeclarations
     ///      /
+    ///
     /// ComptimeDecl <- KEYWORD_comptime Block
     fn parseContainerMembers(p: *Parser) !Members {
         const scratch_top = p.scratch.items.len;
@@ -887,7 +893,9 @@
         }
     }
 
-    /// ContainerField <- KEYWORD_comptime? IDENTIFIER (COLON TypeExpr ByteAlign?)? (EQUAL Expr)?
+    /// ContainerField
+    ///     <- doc_comment? KEYWORD_comptime? IDENTIFIER (COLON TypeExpr)? ByteAlign? (EQUAL Expr)?
+    ///      / doc_comment? KEYWORD_comptime? (IDENTIFIER COLON)? !KEYWORD_fn TypeExpr ByteAlign? (EQUAL Expr)?
    fn expectContainerField(p: *Parser) !Node.Index {
         var main_token = p.tok_i;
         _ = p.eatToken(.keyword_comptime);
@@ -1173,6 +1181,7 @@
     }
 
     /// ForPrefix <- KEYWORD_for LPAREN Expr RPAREN PtrIndexPayload
+    ///
     /// ForStatement
     ///     <- ForPrefix BlockExpr ( KEYWORD_else Statement )?
     ///      / ForPrefix AssignExpr ( SEMICOLON / KEYWORD_else Statement )
@@ -1234,6 +1243,7 @@
     }
 
     /// WhilePrefix <- KEYWORD_while LPAREN Expr RPAREN PtrPayload? WhileContinueExpr?
+    ///
     /// WhileStatement
     ///     <- WhilePrefix BlockExpr ( KEYWORD_else Payload? Statement )?
     ///      / WhilePrefix AssignExpr ( SEMICOLON / KEYWORD_else Payload? Statement )
@@ -1368,13 +1378,18 @@
     }
 
     /// AssignExpr <- Expr (AssignOp Expr)?
+    ///
     /// AssignOp
     ///     <- ASTERISKEQUAL
+    ///      / ASTERISKPIPEEQUAL
     ///      / SLASHEQUAL
     ///      / PERCENTEQUAL
     ///      / PLUSEQUAL
+    ///      / PLUSPIPEEQUAL
     ///      / MINUSEQUAL
+    ///      / MINUSPIPEEQUAL
     ///      / LARROW2EQUAL
+    ///      / LARROW2PIPEEQUAL
     ///      / RARROW2EQUAL
     ///      / AMPERSANDEQUAL
     ///      / CARETEQUAL
@@ -1553,6 +1568,7 @@
     }
 
     /// PrefixExpr <- PrefixOp* PrimaryExpr
+    ///
     /// PrefixOp
     ///     <- EXCLAMATIONMARK
     ///      / MINUS
@@ -1591,17 +1607,21 @@
     }
 
     /// TypeExpr <- PrefixTypeOp* ErrorUnionExpr
+    ///
     /// PrefixTypeOp
     ///     <- QUESTIONMARK
     ///      / KEYWORD_anyframe MINUSRARROW
     ///      / SliceTypeStart (ByteAlign / AddrSpace / KEYWORD_const / KEYWORD_volatile / KEYWORD_allowzero)*
     ///      / PtrTypeStart (AddrSpace / KEYWORD_align LPAREN Expr (COLON Expr COLON Expr)? RPAREN / KEYWORD_const / KEYWORD_volatile / KEYWORD_allowzero)*
     ///      / ArrayTypeStart
+    ///
     /// SliceTypeStart <- LBRACKET (COLON Expr)? RBRACKET
+    ///
     /// PtrTypeStart
     ///     <- ASTERISK
     ///      / ASTERISK2
     ///      / LBRACKET ASTERISK (LETTERC / COLON Expr)? RBRACKET
+    ///
     /// ArrayTypeStart <- LBRACKET Expr (COLON Expr)? RBRACKET
     fn parseTypeExpr(p: *Parser) Error!Node.Index {
         switch (p.token_tags[p.tok_i]) {
@@ -2068,6 +2088,7 @@
     }
 
     /// ForPrefix <- KEYWORD_for LPAREN Expr RPAREN PtrIndexPayload
+    ///
     /// ForExpr <- ForPrefix Expr (KEYWORD_else Expr)?
     fn parseForExpr(p: *Parser) !Node.Index {
         const for_token = p.eatToken(.keyword_for) orelse return null_node;
@@ -2103,6 +2124,7 @@
     }
 
     /// WhilePrefix <- KEYWORD_while LPAREN Expr RPAREN PtrPayload? WhileContinueExpr?
+    ///
     /// WhileExpr <- WhilePrefix Expr (KEYWORD_else Payload? Expr)?
     fn parseWhileExpr(p: *Parser) !Node.Index {
         const while_token = p.eatToken(.keyword_while) orelse return null_node;
@@ -2154,6 +2176,7 @@
     }
 
     /// CurlySuffixExpr <- TypeExpr InitList?
+    ///
     /// InitList
     ///     <- LBRACE FieldInit (COMMA FieldInit)* COMMA? RBRACE
     ///      / LBRACE Expr (COMMA Expr)* COMMA? RBRACE
@@ -2272,7 +2295,9 @@
     /// SuffixExpr
     ///     <- KEYWORD_async PrimaryTypeExpr SuffixOp* FnCallArguments
     ///      / PrimaryTypeExpr (SuffixOp / FnCallArguments)*
+    ///
     /// FnCallArguments <- LPAREN ExprList RPAREN
+    ///
     /// ExprList <- (Expr COMMA)* Expr?
     fn parseSuffixExpr(p: *Parser) !Node.Index {
         if (p.eatToken(.keyword_async)) |_| {
@@ -2410,18 +2435,26 @@
     ///      / KEYWORD_unreachable
     ///      / STRINGLITERAL
     ///      / SwitchExpr
+    ///
     /// ContainerDecl <- (KEYWORD_extern / KEYWORD_packed)? ContainerDeclAuto
-    /// ContainerDeclAuto <- ContainerDeclType LBRACE ContainerMembers RBRACE
+    ///
+    /// ContainerDeclAuto <- ContainerDeclType LBRACE container_doc_comment? ContainerMembers RBRACE
+    ///
     /// InitList
     ///     <- LBRACE FieldInit (COMMA FieldInit)* COMMA? RBRACE
     ///      / LBRACE Expr (COMMA Expr)* COMMA? RBRACE
     ///      / LBRACE RBRACE
+    ///
     /// ErrorSetDecl <- KEYWORD_error LBRACE IdentifierList RBRACE
+    ///
     /// GroupedExpr <- LPAREN Expr RPAREN
+    ///
    /// IfTypeExpr <- IfPrefix TypeExpr (KEYWORD_else Payload? TypeExpr)?
+    ///
     /// LabeledTypeExpr
     ///     <- BlockLabel Block
     ///      / BlockLabel? LoopTypeExpr
+    ///
     /// LoopTypeExpr <- KEYWORD_inline? (ForTypeExpr / WhileTypeExpr)
     fn parsePrimaryTypeExpr(p: *Parser) !Node.Index {
         switch (p.token_tags[p.tok_i]) {
@@ -2751,6 +2784,7 @@
     }
 
     /// ForPrefix <- KEYWORD_for LPAREN Expr RPAREN PtrIndexPayload
+    ///
     /// ForTypeExpr <- ForPrefix TypeExpr (KEYWORD_else TypeExpr)?
     fn parseForTypeExpr(p: *Parser) !Node.Index {
         const for_token = p.eatToken(.keyword_for) orelse return null_node;
@@ -2786,6 +2820,7 @@
     }
 
     /// WhilePrefix <- KEYWORD_while LPAREN Expr RPAREN PtrPayload? WhileContinueExpr?
+    ///
     /// WhileTypeExpr <- WhilePrefix TypeExpr (KEYWORD_else Payload? TypeExpr)?
     fn parseWhileTypeExpr(p: *Parser) !Node.Index {
         const while_token = p.eatToken(.keyword_while) orelse return null_node;
@@ -2861,11 +2896,17 @@
     }
 
     /// AsmExpr <- KEYWORD_asm KEYWORD_volatile? LPAREN Expr AsmOutput? RPAREN
+    ///
     /// AsmOutput <- COLON AsmOutputList AsmInput?
+    ///
     /// AsmInput <- COLON AsmInputList AsmClobbers?
+    ///
     /// AsmClobbers <- COLON StringList
+    ///
     /// StringList <- (STRINGLITERAL COMMA)* STRINGLITERAL?
+    ///
     /// AsmOutputList <- (AsmOutputItem COMMA)* AsmOutputItem?
+    ///
     /// AsmInputList <- (AsmInputItem COMMA)* AsmInputItem?
     fn expectAsmExpr(p: *Parser) !Node.Index {
         const asm_token = p.assertToken(.keyword_asm);
@@ -3069,15 +3110,17 @@
         return expr_node;
     }
 
-    /// ParamDecl
-    ///     <- (KEYWORD_noalias / KEYWORD_comptime)? (IDENTIFIER COLON)? ParamType
-    ///      / DOT3
-    /// ParamType
-    ///     <- Keyword_anytype
-    ///      / TypeExpr
     /// This function can return null nodes and then still return nodes afterwards,
     /// such as in the case of anytype and `...`. Caller must look for rparen to find
     /// out when there are no more param decls left.
+    ///
+    /// ParamDecl
+    ///     <- doc_comment? (KEYWORD_noalias / KEYWORD_comptime)? (IDENTIFIER COLON)? ParamType
+    ///      / DOT3
+    ///
+    /// ParamType
+    ///     <- KEYWORD_anytype
+    ///      / TypeExpr
     fn expectParamDecl(p: *Parser) !Node.Index {
         _ = try p.eatDocComments();
         switch (p.token_tags[p.tok_i]) {
@@ -3119,8 +3162,9 @@
         return identifier;
     }
 
-    /// PtrIndexPayload <- PIPE ASTERISK? IDENTIFIER (COMMA IDENTIFIER)? PIPE
     /// Returns the first identifier token, if any.
+    ///
+    /// PtrIndexPayload <- PIPE ASTERISK? IDENTIFIER (COMMA IDENTIFIER)? PIPE
     fn parsePtrIndexPayload(p: *Parser) !TokenIndex {
         _ = p.eatToken(.pipe) orelse return @as(TokenIndex, 0);
         _ = p.eatToken(.asterisk);
@@ -3133,6 +3177,7 @@
     }
 
     /// SwitchProng <- KEYWORD_inline? SwitchCase EQUALRARROW PtrIndexPayload? AssignExpr
+    ///
     /// SwitchCase
     ///     <- SwitchItem (COMMA SwitchItem)* COMMA?
     ///      / KEYWORD_else
@@ -3385,6 +3430,7 @@
     }
 
     /// Caller must have already verified the first token.
+    ///
     /// ContainerDeclAuto <- ContainerDeclType LBRACE container_doc_comment? ContainerMembers RBRACE
     ///
     /// ContainerDeclType
@@ -3556,6 +3602,7 @@
     }
 
     /// Holds temporary data until we are ready to construct the full ContainerDecl AST node.
+    ///
     /// ByteAlign <- KEYWORD_align LPAREN Expr RPAREN
     fn parseByteAlign(p: *Parser) !Node.Index {
         _ = p.eatToken(.keyword_align) orelse return null_node;
@@ -3625,6 +3672,7 @@
     }
 
     /// FnCallArguments <- LPAREN ExprList RPAREN
+    ///
     /// ExprList <- (Expr COMMA)* Expr?
     fn parseBuiltinCall(p: *Parser) !Node.Index {
         const builtin_token = p.assertToken(.builtin);
@@ -3698,7 +3746,7 @@
         }
     }
 
-    /// KEYWORD_if LPAREN Expr RPAREN PtrPayload? Body (KEYWORD_else Payload? Body)?
+    /// IfPrefix <- KEYWORD_if LPAREN Expr RPAREN PtrPayload?
     fn parseIf(p: *Parser, comptime bodyParseFn: fn (p: *Parser) Error!Node.Index) !Node.Index {
         const if_token = p.eatToken(.keyword_if) orelse return null_node;
         _ = try p.expectToken(.l_paren);
diff --git a/tools/extract-grammar.zig b/tools/extract-grammar.zig
new file mode 100644
index 0000000000..8e8a17bacf
--- /dev/null
+++ b/tools/extract-grammar.zig
@@ -0,0 +1,100 @@
+//! Extract the "de facto" Zig grammar from the parser in lib/std/zig/parse.zig.
+//!
+//! The generated file must be edited by hand in order to remove normal doc-comments.
+
+const std = @import("std");
+const fs = std.fs;
+const heap = std.heap;
+const io = std.io;
+const mem = std.mem;
+const process = std.process;
+const zig = std.zig;
+
+const Buffer = struct {
+    const buf_size = 4096;
+
+    buf: [buf_size]u8 = undefined,
+    pos: usize = 0,
+
+    pub fn append(self: *Buffer, src: []const u8) !void {
+        if (self.pos + src.len > buf_size) {
+            return error.BufferOverflow;
+        }
+
+        mem.copy(u8, self.buf[self.pos..buf_size], src);
+        self.pos += src.len;
+    }
+
+    pub fn reset(self: *Buffer) void {
+        self.pos = 0;
+    }
+
+    pub fn slice(self: *Buffer) []const u8 {
+        return self.buf[0..self.pos];
+    }
+};
+
+/// There are many assumptions in the entire codebase that Zig source files can
+/// be byte-indexed with a u32 integer.
+const max_src_size = std.math.maxInt(u32);
+
+var stdout = io.getStdOut().writer();
+
+pub fn main() !void {
+    var arena = heap.ArenaAllocator.init(heap.page_allocator);
+    defer arena.deinit(); // Not strictly needed, since the process exits right after main returns.
+
+    const allocator = arena.allocator();
+
+    var args_it = try process.argsWithAllocator(allocator);
+    _ = args_it.skip(); // Skip the program name.
+
+    const path = args_it.next() orelse return error.SourceFileRequired;
+    const src = try read(path, allocator);
+
+    var tokenizer = zig.Tokenizer.init(src);
+    var buf: Buffer = Buffer{};
+    while (true) {
+        const token = tokenizer.next();
+        switch (token.tag) {
+            .eof => break,
+            .doc_comment => {
+                const line = blk: {
+                    // Strip the leading "///" marker and the space after it, if any.
+                    const len = token.loc.end - token.loc.start;
+                    break :blk if (len == 3) src[token.loc.start + 3 .. token.loc.end] else src[token.loc.start + 4 .. token.loc.end];
+                };
+
+                try buf.append(line);
+                try buf.append("\n");
+            },
+            .keyword_fn => {
+                const doc = buf.slice();
+                buf.reset();
+
+                // Only emit docs that contain a PEG grammar rule ("<-"),
+                // so that normal doc-comments are ignored.
+                if (mem.indexOf(u8, doc, "<-") != null) {
+                    // Separate each doc with an empty line. This in turn
+                    // ensures that the rules are separated by an empty line.
+                    try stdout.print("{s}\n", .{doc});
+                }
+            },
+            else => {},
+        }
+    }
+}
+
+fn read(path: []const u8, allocator: mem.Allocator) ![:0]const u8 {
+    var f = try fs.cwd().openFile(path, .{ .mode = .read_only });
+    defer f.close();
+
+    const st = try f.stat();
+    if (st.size > max_src_size) return error.FileTooBig;
+
+    const src = try allocator.allocSentinel(u8, @intCast(usize, st.size), 0);
+    const n = try f.readAll(src);
+    if (n != st.size) return error.UnexpectedEndOfFile;
+
+    return src;
+}
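
A usage sketch, assuming the tool is run from the root of the Zig source tree (the output file name grammar.peg is arbitrary; zig run forwards the arguments after "--" to the program):

    $ zig run tools/extract-grammar.zig -- lib/std/zig/parse.zig > grammar.peg

For example, from the expectParamDecl doc-comment above, the extractor would emit the block below: each doc line is stripped of its "/// " prefix, the whole doc is kept because it contains "<-", and the trailing newline in the print call produces the blank line that separates it from the next rule. The three leading prose lines are exactly the kind of normal doc-comment that must then be removed by hand, as the file header notes:

    This function can return null nodes and then still return nodes afterwards,
    such as in the case of anytype and `...`. Caller must look for rparen to find
    out when there are no more param decls left.

    ParamDecl
        <- doc_comment? (KEYWORD_noalias / KEYWORD_comptime)? (IDENTIFIER COLON)? ParamType
         / DOT3

    ParamType
        <- KEYWORD_anytype
         / TypeExpr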