From 1b24f4c73c7f61d123eb8f0c4e90b267d77ec596 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 23 Nov 2015 21:30:12 -0700 Subject: [PATCH] parsing hello.zig example with recursive descent that was easy --- README.md | 28 +++++ src/grammar.txt | 57 --------- src/main.cpp | 6 +- src/parser.cpp | 311 ++++++++++++++++++++++++++++++++++++++++++++-- src/parser.hpp | 50 ++++++-- src/tokenizer.hpp | 25 ---- src/util.hpp | 6 + 7 files changed, 379 insertions(+), 104 deletions(-) delete mode 100644 src/grammar.txt diff --git a/README.md b/README.md index 56f99b784d..df6e0db00f 100644 --- a/README.md +++ b/README.md @@ -69,3 +69,31 @@ zig | C equivalent | Description f128 | long double | 128-bit IEE754 floating point isize | ssize_t | signed pointer sized integer usize | size_t | unsigned pointer sized integer + +### Grammar + +``` +Root : many(FnDecl) token(EOF); + +FnDecl : token(Fn) token(Symbol) ParamDeclList option(token(Arrow) Type) Block; + +ParamDeclList : token(LParen) list(ParamDecl, token(Comma)) token(RParen); + +ParamDecl : token(Symbol) token(Colon) Type; + +Type : token(Symbol) | PointerType; + +PointerType : token(Star) token(Const) Type | token(Star) token(Mut) Type; + +Block : token(LBrace) many(Statement) token(RBrace); + +Statement : ExpressionStatement | ReturnStatement ; + +ExpressionStatement : Expression token(Semicolon) ; + +ReturnStatement : token(Return) Expression token(Semicolon) ; + +Expression : token(Number) | token(String) | FnCall ; + +FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ; +``` diff --git a/src/grammar.txt b/src/grammar.txt deleted file mode 100644 index 015adfe0f4..0000000000 --- a/src/grammar.txt +++ /dev/null @@ -1,57 +0,0 @@ -Root : many(FnDecl) token(EOF) { - $$ = ast_create_root($1); -}; - -FnDecl : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) token(Arrow) Type Block { - $$ = ast_create_fn_decl($2, $4, $7, $8); -} | token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) Block { - $$ = ast_create_void_fn_decl($2, $4, $6); -}; - -ParamDecl : token(Symbol) token(Colon) Type { - $$ = ast_create_param_decl($1, $2); -}; - -Type : token(Symbol) { - $$ = ast_create_symbol_type($1); -} | PointerType { - $$ = $1; -}; - -PointerType : token(Star) token(Const) Type { - $$ = ast_create_pointer_type($2, $3); -} | token(Star) token(Mut) Type { - $$ = ast_create_pointer_type($2, $3); -}; - -Block : token(LBrace) many(Statement) Expression token(RBrace) { - $$ = ast_create_expr_block($2, $3); -} | token(LBrace) many(Statement) token(RBrace) { - $$ = ast_create_block($2); -}; - -Statement : ExpressionStatement { - $$ = $1; -} | ReturnStatement { - $$ = $1; -}; - -ExpressionStatement : Expression token(Semicolon) { - $$ = ast_create_expression_statement($1); -}; - -ReturnStatement : token(Return) Expression token(Semicolon) { - $$ = ast_create_return_statement($2); -}; - -Expression : token(Number) { - $$ = ast_create_number($1); -} | token(String) { - $$ = ast_create_string($1); -} | FnCall { - $$ = $1; -}; - -FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) { - $$ = ast_create_fn_call($1, $3); -}; diff --git a/src/main.cpp b/src/main.cpp index 1f28ce41bc..ff7deadd80 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -89,6 +89,8 @@ static int build(const char *arg0, const char *in_file, const char *out_file, Zi AstNode *root = ast_parse(in_data, tokens); assert(root); + fprintf(stderr, "\nAST:\n"); + fprintf(stderr, "------\n"); ast_print(root, 0); @@ -135,7 +137,7 @@ int main(int argc, char **argv) { } else { switch (cmd) { case CmdNone: - zig_panic("unreachable"); + zig_unreachable(); case CmdBuild: if (!in_file) { in_file = arg; @@ -154,6 +156,6 @@ int main(int argc, char **argv) { return build(arg0, in_file, out_file, &include_paths); } - zig_panic("unreachable"); + zig_unreachable(); } diff --git a/src/parser.cpp b/src/parser.cpp index a89a9b775b..0b887ba0c6 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -41,7 +41,7 @@ const char *node_type_str(NodeType node_type) { case NodeTypeFnCall: return "FnCall"; } - zig_panic("unreachable"); + zig_unreachable(); } void ast_print(AstNode *node, int indent) { @@ -82,28 +82,321 @@ void ast_print(AstNode *node, int indent) { struct ParseContext { Buf *buf; AstNode *root; + ZigList *tokens; }; -AstNode *ast_create_root(void) { - zig_panic("TODO create root"); +static AstNode *ast_create_node(NodeType type) { + AstNode *node = allocate(1); + node->type = type; + return node; } -void ast_invalid_token_error(Buf *buf, Token *token) { +static void ast_buf_from_token(ParseContext *pc, Token *token, Buf *buf) { + buf_init_from_mem(buf, buf_ptr(pc->buf) + token->start_pos, token->end_pos - token->start_pos); +} + +static void ast_invalid_token_error(ParseContext *pc, Token *token) { Buf token_value = {0}; - buf_init_from_mem(&token_value, buf_ptr(buf) + token->start_pos, token->end_pos - token->start_pos); + ast_buf_from_token(pc, token, &token_value); ast_error(token, "invalid token: '%s'", buf_ptr(&token_value)); } -void ast_parse_fn_decls(ParseContext *pc, ZigList *fn_decls) { - zig_panic("TODO parse fn decls"); +static AstNode *ast_parse_expression(ParseContext *pc, int token_index, int *new_token_index); + + +static void ast_expect_token(ParseContext *pc, Token *token, TokenId token_id) { + if (token->id != token_id) { + ast_invalid_token_error(pc, token); + } +} + +/* +Type : token(Symbol) | PointerType; +PointerType : token(Star) token(Const) Type | token(Star) token(Mut) Type; +*/ +static AstNode *ast_parse_type(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeType); + + Token *token = &pc->tokens->at(token_index); + token_index += 1; + + if (token->id == TokenIdSymbol) { + node->data.type.type = AstNodeTypeTypePrimitive; + ast_buf_from_token(pc, token, &node->data.type.primitive_name); + } else if (token->id == TokenIdStar) { + Token *const_or_mut = &pc->tokens->at(token_index); + token_index += 1; + if (const_or_mut->id == TokenIdKeywordMut) { + node->data.type.is_const = false; + } else if (const_or_mut->id == TokenIdKeywordConst) { + node->data.type.is_const = true; + } else { + ast_invalid_token_error(pc, const_or_mut); + } + + node->data.type.child_type = ast_parse_type(pc, token_index, &token_index); + } else { + ast_invalid_token_error(pc, token); + } + + *new_token_index = token_index; + return node; +} + +/* +ParamDecl : token(Symbol) token(Colon) Type { +}; +*/ +static AstNode *ast_parse_param_decl(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeParamDecl); + + Token *param_name = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, param_name, TokenIdSymbol); + + ast_buf_from_token(pc, param_name, &node->data.param_decl.name); + + Token *colon = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, colon, TokenIdColon); + + node->data.param_decl.type = ast_parse_type(pc, token_index, &token_index); + + *new_token_index = token_index; + return node; +} + + +static void ast_parse_param_decl_list(ParseContext *pc, int token_index, int *new_token_index, + ZigList *params) +{ + Token *l_paren = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, l_paren, TokenIdLParen); + + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdRParen) { + token_index += 1; + *new_token_index = token_index; + return; + } + + for (;;) { + AstNode *param_decl_node = ast_parse_param_decl(pc, token_index, &token_index); + params->append(param_decl_node); + + Token *token = &pc->tokens->at(token_index); + token_index += 1; + if (token->id == TokenIdRParen) { + *new_token_index = token_index; + return; + } else { + ast_expect_token(pc, token, TokenIdComma); + } + } + zig_unreachable(); +} + +static void ast_parse_fn_call_param_list(ParseContext *pc, int token_index, int *new_token_index, + ZigList *params) +{ + Token *l_paren = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, l_paren, TokenIdLParen); + + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdRParen) { + token_index += 1; + *new_token_index = token_index; + return; + } + + for (;;) { + AstNode *expr = ast_parse_expression(pc, token_index, &token_index); + params->append(expr); + + Token *token = &pc->tokens->at(token_index); + token_index += 1; + if (token->id == TokenIdRParen) { + *new_token_index = token_index; + return; + } else { + ast_expect_token(pc, token, TokenIdComma); + } + } + zig_unreachable(); +} + +/* +FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ; +*/ +static AstNode *ast_parse_fn_call(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeFnCall); + + Token *fn_name = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, fn_name, TokenIdSymbol); + + ast_buf_from_token(pc, fn_name, &node->data.fn_call.name); + + ast_parse_fn_call_param_list(pc, token_index, &token_index, &node->data.fn_call.params); + + *new_token_index = token_index; + return node; +} + +static AstNode *ast_parse_expression(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeExpression); + + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdSymbol) { + node->data.expression.type = AstNodeExpressionTypeFnCall; + node->data.expression.data.fn_call = ast_parse_fn_call(pc, token_index, &token_index); + } else if (token->id == TokenIdNumberLiteral) { + node->data.expression.type = AstNodeExpressionTypeNumber; + ast_buf_from_token(pc, token, &node->data.expression.data.number); + token_index += 1; + } else if (token->id == TokenIdStringLiteral) { + node->data.expression.type = AstNodeExpressionTypeString; + ast_buf_from_token(pc, token, &node->data.expression.data.string); + token_index += 1; + } else { + ast_invalid_token_error(pc, token); + } + + *new_token_index = token_index; + return node; +} + +/* +Statement : ExpressionStatement | ReturnStatement ; + +ExpressionStatement : Expression token(Semicolon) ; + +ReturnStatement : token(Return) Expression token(Semicolon) ; + +Expression : token(Number) | token(String) | FnCall ; + +FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ; +*/ +static AstNode *ast_parse_statement(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeStatement); + + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdKeywordReturn) { + token_index += 1; + node->data.statement.type = AstNodeStatementTypeReturn; + node->data.statement.data.retrn.expression = ast_parse_expression(pc, token_index, &token_index); + + Token *semicolon = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, semicolon, TokenIdSemicolon); + } else if (token->id == TokenIdSymbol || + token->id == TokenIdStringLiteral || + token->id == TokenIdNumberLiteral) + { + node->data.statement.type = AstNodeStatementTypeExpression; + node->data.statement.data.expr.expression = ast_parse_expression(pc, token_index, &token_index); + + Token *semicolon = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, semicolon, TokenIdSemicolon); + } else { + ast_invalid_token_error(pc, token); + } + + *new_token_index = token_index; + return node; +} + +/* +Block : token(LBrace) many(Statement) token(RBrace); +*/ +static AstNode *ast_parse_block(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeBlock); + + Token *l_brace = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, l_brace, TokenIdLBrace); + + for (;;) { + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdRBrace) { + token_index += 1; + *new_token_index = token_index; + return node; + } else { + AstNode *statement_node = ast_parse_statement(pc, token_index, &token_index); + node->data.block.statements.append(statement_node); + } + } + zig_unreachable(); +} + +/* +FnDecl : token(Fn) token(Symbol) ParamDeclList option(token(Arrow) Type) Block; +*/ +static AstNode *ast_parse_fn_decl(ParseContext *pc, int token_index, int *new_token_index) { + AstNode *node = ast_create_node(NodeTypeFnDecl); + + Token *fn_token = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, fn_token, TokenIdKeywordFn); + + Token *fn_name = &pc->tokens->at(token_index); + token_index += 1; + ast_expect_token(pc, fn_name, TokenIdSymbol); + + ast_buf_from_token(pc, fn_name, &node->data.fn_decl.name); + + + ast_parse_param_decl_list(pc, token_index, &token_index, &node->data.fn_decl.params); + + Token *arrow = &pc->tokens->at(token_index); + token_index += 1; + if (arrow->id == TokenIdArrow) { + node->data.fn_decl.return_type = ast_parse_type(pc, token_index, &token_index); + } else if (arrow->id == TokenIdLBrace) { + node->data.fn_decl.return_type = nullptr; + } else { + ast_invalid_token_error(pc, arrow); + } + + node->data.fn_decl.body = ast_parse_block(pc, token_index, &token_index); + + *new_token_index = token_index; + return node; +} + + +static void ast_parse_fn_decl_list(ParseContext *pc, int token_index, ZigList *fn_decls, + int *new_token_index) +{ + for (;;) { + Token *token = &pc->tokens->at(token_index); + if (token->id == TokenIdKeywordFn) { + AstNode *fn_decl_node = ast_parse_fn_decl(pc, token_index, &token_index); + fn_decls->append(fn_decl_node); + } else { + *new_token_index = token_index; + return; + } + } + zig_unreachable(); } AstNode *ast_parse(Buf *buf, ZigList *tokens) { ParseContext pc = {0}; pc.buf = buf; - pc.root = ast_create_root(); + pc.root = ast_create_node(NodeTypeRoot); + pc.tokens = tokens; - ast_parse_fn_decls(&pc, &pc.root->data.root.fn_decls); + int new_token_index; + ast_parse_fn_decl_list(&pc, 0, &pc.root->data.root.fn_decls, &new_token_index); + + if (new_token_index != tokens->length - 1) { + ast_invalid_token_error(&pc, &tokens->at(new_token_index)); + } return pc.root; } diff --git a/src/parser.hpp b/src/parser.hpp index b00c6211f7..2a46f76698 100644 --- a/src/parser.hpp +++ b/src/parser.hpp @@ -44,20 +44,49 @@ enum AstNodeTypeType { struct AstNodeType { AstNodeTypeType type; - AstNode *child; -}; - -struct AstNodePointerType { - AstNode *const_or_mut; - AstNode *type; + Buf primitive_name; + AstNode *child_type; + bool is_const; }; struct AstNodeBlock { - ZigList expressions; + ZigList statements; +}; + +enum AstNodeStatementType { + AstNodeStatementTypeExpression, + AstNodeStatementTypeReturn, +}; + +struct AstNodeStatementExpression { + AstNode *expression; +}; + +struct AstNodeStatementReturn { + AstNode *expression; +}; + +struct AstNodeStatement { + AstNodeStatementType type; + union { + AstNodeStatementExpression expr; + AstNodeStatementReturn retrn; + } data; +}; + +enum AstNodeExpressionType { + AstNodeExpressionTypeNumber, + AstNodeExpressionTypeString, + AstNodeExpressionTypeFnCall, }; struct AstNodeExpression { - AstNode *child; + AstNodeExpressionType type; + union { + Buf number; + Buf string; + AstNode *fn_call; + } data; }; struct AstNodeFnCall { @@ -74,13 +103,14 @@ struct AstNode { AstNodeType type; AstNodeParamDecl param_decl; AstNodeBlock block; + AstNodeStatement statement; AstNodeExpression expression; AstNodeFnCall fn_call; } data; }; __attribute__ ((format (printf, 2, 3))) -void ast_error(Token *token, const char *format, ...); +void ast_token_error(Token *token, const char *format, ...); void ast_invalid_token_error(Buf *buf, Token *token); @@ -91,6 +121,4 @@ const char *node_type_str(NodeType node_type); void ast_print(AstNode *node, int indent); -AstNode *ast_create_root(void); - #endif diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index cd9c1e1325..195db67504 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -10,7 +10,6 @@ #include "buffer.hpp" -/* enum TokenId { TokenIdEof, TokenIdSymbol, @@ -32,30 +31,6 @@ enum TokenId { TokenIdArrow, TokenIdDash, }; -*/ - -// TODO: debug delete this -enum TokenId { - TokenIdLParen = 0, - TokenIdRParen = 1, - TokenIdEof = 2, - TokenIdStar = 3, - TokenIdPlus = 4, - TokenIdSymbol, - TokenIdKeywordFn, - TokenIdKeywordReturn, - TokenIdKeywordMut, - TokenIdKeywordConst, - TokenIdComma, - TokenIdLBrace, - TokenIdRBrace, - TokenIdStringLiteral, - TokenIdSemicolon, - TokenIdNumberLiteral, - TokenIdColon, - TokenIdArrow, - TokenIdDash, -}; struct Token { TokenId id; diff --git a/src/util.hpp b/src/util.hpp index 2c083b6892..1322b6aec2 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -20,6 +20,12 @@ void zig_panic(const char *format, ...) __attribute__ ((noreturn)) __attribute__ ((format (printf, 1, 2))); +__attribute__((cold)) +__attribute__ ((noreturn)) +static inline void zig_unreachable(void) { + zig_panic("unreachable"); +} + template __attribute__((malloc)) static inline T *allocate_nonzero(size_t count) { T *ptr = reinterpret_cast(malloc(count * sizeof(T)));