parser generator supports sub rules

This commit is contained in:
Andrew Kelley 2015-11-06 23:59:40 -07:00
parent 72be61fc0a
commit ae0e9685c6
3 changed files with 172 additions and 33 deletions

View File

@ -1,46 +1,50 @@
Root : many(FnDecl) token(EOF) {
Root<node> : many(FnDecl) token(EOF) {
$$ = ast_create_root($1);
};
FnDecl : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(token(Arrow) Type) Block {
FnDecl<node> : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(ReturnType) Block {
$$ = ast_create_fn_decl($2, $4, $6, $7);
};
ParamDecl : token(Symbol) token(Colon) Type {
ReturnType<node> : token(Arrow) Type {
$$ = $2;
};
ParamDecl<node> : token(Symbol) token(Colon) Type {
$$ = ast_create_param_decl($1, $2);
};
Type : token(Symbol) {
Type<node> : token(Symbol) {
$$ = ast_create_symbol_type($1);
} | PointerType {
$$ = $1;
};
PointerType : token(Star) token(Const) Type {
PointerType<node> : token(Star) token(Const) Type {
$$ = ast_create_pointer_type($2, $3);
} | token(Star) token(Mut) Type {
$$ = ast_create_pointer_type($2, $3);
};
Block : token(LBrace) many(Statement) option(Expression) token(RBrace) {
Block<node> : token(LBrace) many(Statement) option(Expression) token(RBrace) {
$$ = ast_create_block($2, $3);
};
Statement : ExpressionStatement {
Statement<node> : ExpressionStatement {
$$ = $1;
} | ReturnStatement {
$$ = $1;
};
ExpressionStatement : Expression token(Semicolon) {
ExpressionStatement<node> : Expression token(Semicolon) {
$$ = ast_create_expression_statement($1);
};
ReturnStatement : token(Return) Expression token(Semicolon) {
ReturnStatement<node> : token(Return) Expression token(Semicolon) {
$$ = ast_create_return_statement($2);
};
Expression : token(Number) {
Expression<node> : token(Number) {
$$ = ast_create_number($1);
} | token(String) {
$$ = ast_create_string($1);
@ -48,6 +52,6 @@ Expression : token(Number) {
$$ = $1;
};
FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) {
FnCall<node> : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) {
$$ = ast_create_fn_call($1, $3);
};

View File

@ -136,6 +136,7 @@ struct RuleTuple {
Buf name;
ZigList<RuleNode *> children;
Buf body;
Buf union_field_name;
};
struct RuleMany {
@ -161,6 +162,9 @@ struct RuleList {
struct RuleSubRule {
RuleNode *child;
// for lexer use only
Buf name;
};
enum RuleNodeType {
@ -175,6 +179,8 @@ enum RuleNodeType {
struct RuleNode {
RuleNodeType type;
int lex_line;
int lex_column;
union {
RuleTuple tuple;
RuleMany many;
@ -205,6 +211,7 @@ struct CodeGenCapture {
Buf *body;
bool is_root;
Buf *field_names;
Buf *union_field_name;
};
struct CodeGen {
@ -225,6 +232,8 @@ struct ParserState {
enum LexState {
LexStateStart,
LexStateRuleName,
LexStateRuleFieldNameStart,
LexStateRuleFieldName,
LexStateWaitForColon,
LexStateTupleRule,
LexStateFnName,
@ -232,6 +241,7 @@ enum LexState {
LexStateToken,
LexStateBody,
LexStateEndOrOr,
LexStateSubTupleName,
};
struct LexStack {
@ -258,6 +268,8 @@ struct Gen {
int lex_token_name_begin;
int lex_body_begin;
int lex_body_end;
int lex_sub_tuple_begin;
int lex_field_name_begin;
};
static ParserState *create_state(Gen *g) {
@ -303,12 +315,13 @@ static void state_add_push_node(ParserState *state) {
state_add_code(state, code);
}
static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count) {
static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count, Buf *union_field_name) {
CodeGen *code = allocate<CodeGen>(1);
code->type = CodeGenTypeCapture;
code->capture.body = body;
code->capture.is_root = is_root;
code->capture.field_names = allocate<Buf>(field_name_count);
code->capture.union_field_name = union_field_name;
return code;
}
@ -325,6 +338,7 @@ static void state_add_eat_token(ParserState *state) {
}
static void gen(Gen *g, RuleNode *node, Buf *out_field_name) {
assert(node);
switch (node->type) {
case RuleNodeTypeToken:
{
@ -346,13 +360,14 @@ static void gen(Gen *g, RuleNode *node, Buf *out_field_name) {
break;
case RuleNodeTypeTuple:
{
buf_init_from_str(out_field_name, "node");
buf_init_from_buf(out_field_name, &node->tuple.union_field_name);
state_add_push_node(g->cur_state);
bool is_root = (node == g->root);
int field_name_count = node->tuple.children.length;
CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count);
CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count,
&node->tuple.union_field_name);
for (int i = 0; i < node->tuple.children.length; i += 1) {
RuleNode *child = node->tuple.children.at(i);
@ -411,7 +426,7 @@ static void lex_error(Gen *g, const char *format, ...) {
va_list ap;
va_start(ap, format);
fprintf(stderr, "Error: Line %d, column %d: ", line, column);
fprintf(stderr, "Grammar Error: Line %d, column %d: ", line, column);
vfprintf(stderr, format, ap);
fprintf(stderr, "\n");
va_end(ap);
@ -428,10 +443,16 @@ static void lex_pop_stack(Gen *g) {
g->lex_stack.pop();
}
static RuleNode *create_rule_node(Gen *g) {
RuleNode *node = allocate<RuleNode>(1);
node->lex_line = g->lex_line;
node->lex_column = g->lex_column;
return node;
}
static void begin_rule(Gen *g) {
assert(!g->lex_cur_rule);
g->lex_cur_rule = allocate<RuleNode>(1);
g->lex_cur_rule = create_rule_node(g);
g->lex_cur_rule->type = RuleNodeTypeTuple;
g->lex_cur_rule_begin = g->lex_pos;
@ -452,6 +473,18 @@ static void end_rule_name(Gen *g) {
buf_init_from_mem(&g->lex_cur_rule->tuple.name, ptr, len);
}
static void begin_rule_field_name(Gen *g) {
assert(g->lex_cur_rule);
g->lex_field_name_begin = g->lex_pos;
}
static void end_rule_field_name(Gen *g) {
assert(g->lex_cur_rule);
char *ptr = &buf_ptr(g->in_buf)[g->lex_field_name_begin];
int len = g->lex_pos - g->lex_field_name_begin;
buf_init_from_mem(&g->lex_cur_rule->tuple.union_field_name, ptr, len);
}
static void begin_fn_name(Gen *g) {
g->lex_fn_name_begin = g->lex_pos;
lex_push_stack(g);
@ -478,15 +511,13 @@ static void end_token_name(Gen *g) {
buf_init_from_mem(&token_name, ptr, len);
Token *token = find_or_create_token(g, &token_name);
RuleNode *node = allocate<RuleNode>(1);
RuleNode *node = create_rule_node(g);
node->type = RuleNodeTypeToken;
node->token.token = token;
assert(g->lex_cur_rule->type == RuleNodeTypeTuple);
g->lex_cur_rule->tuple.children.append(node);
g->biggest_tuple_len = max(g->biggest_tuple_len, g->lex_cur_rule->tuple.children.length);
lex_pop_stack(g);
}
@ -504,6 +535,36 @@ static void end_tuple_body(Gen *g) {
buf_init_from_mem(&g->lex_cur_rule->tuple.body, ptr, len);
}
static void begin_sub_tuple(Gen *g) {
g->lex_sub_tuple_begin = g->lex_pos;
lex_push_stack(g);
}
static void end_sub_tuple(Gen *g) {
assert(g->lex_cur_rule->type == RuleNodeTypeTuple);
char *ptr = &buf_ptr(g->in_buf)[g->lex_sub_tuple_begin];
int len = g->lex_pos - g->lex_sub_tuple_begin;
RuleNode *node = create_rule_node(g);
node->type = RuleNodeTypeSubRule;
buf_init_from_mem(&node->sub_rule.name, ptr, len);
g->lex_cur_rule->tuple.children.append(node);
lex_pop_stack(g);
}
static RuleNode *find_rule_node(Gen *g, Buf *name) {
for (int i = 0; i < g->rules.length; i += 1) {
RuleNode *node = g->rules.at(i);
assert(node->type == RuleNodeTypeTuple);
if (buf_eql_buf(&node->tuple.name, name)) {
return node;
}
}
return nullptr;
}
static void initialize_rules(Gen *g) {
g->lex_state = LexStateStart;
for (g->lex_pos = 0; g->lex_pos < buf_len(g->in_buf); g->lex_pos += 1) {
@ -524,18 +585,36 @@ static void initialize_rules(Gen *g) {
break;
case LexStateRuleName:
switch (c) {
case WHITESPACE:
case '<':
end_rule_name(g);
g->lex_state = LexStateWaitForColon;
break;
case ':':
end_rule_name(g);
g->lex_state = LexStateTupleRule;
g->lex_state = LexStateRuleFieldNameStart;
break;
case SYMBOL_CHAR:
// ok
break;
default:
lex_error(g, "invalid char: '%c'", c);
lex_error(g, "expected '<', not '%c'", c);
}
break;
case LexStateRuleFieldNameStart:
switch (c) {
case SYMBOL_CHAR:
begin_rule_field_name(g);
g->lex_state = LexStateRuleFieldName;
break;
default:
lex_error(g, "expected field name, not '%c'", c);
}
break;
case LexStateRuleFieldName:
switch (c) {
case SYMBOL_CHAR:
// ok
break;
case '>':
end_rule_field_name(g);
g->lex_state = LexStateWaitForColon;
break;
}
break;
case LexStateWaitForColon:
@ -559,12 +638,16 @@ static void initialize_rules(Gen *g) {
begin_fn_name(g);
g->lex_state = LexStateFnName;
break;
case UPPER_ALPHA:
begin_sub_tuple(g);
g->lex_state = LexStateSubTupleName;
break;
case '{':
begin_tuple_body(g);
g->lex_state = LexStateBody;
break;
default:
lex_error(g, "invalid char: '%c'", c);
lex_error(g, "expected rule, not '%c'", c);
}
break;
case LexStateFnName:
@ -589,7 +672,7 @@ static void initialize_rules(Gen *g) {
g->lex_state = LexStateToken;
break;
default:
lex_error(g, "invalid char '%c'", c);
lex_error(g, "expected token name, not '%c'", c);
}
break;
case LexStateToken:
@ -601,7 +684,7 @@ static void initialize_rules(Gen *g) {
end_token_name(g);
break;
default:
lex_error(g, "invalid char '%c'", c);
lex_error(g, "expected token name or ')', not '%c'", c);
}
break;
case LexStateBody:
@ -627,6 +710,20 @@ static void initialize_rules(Gen *g) {
default:
lex_error(g, "expected ';' or '|'");
}
break;
case LexStateSubTupleName:
switch (c) {
case ALPHA:
// ignore
break;
case WHITESPACE:
end_sub_tuple(g);
assert(g->lex_state == LexStateTupleRule);
break;
default:
lex_error(g, "expected rule name, not '%c'", c);
}
break;
}
if (c == '\n') {
g->lex_line += 1;
@ -647,9 +744,40 @@ static void initialize_rules(Gen *g) {
case LexStateTokenStart:
case LexStateToken:
case LexStateBody:
case LexStateSubTupleName:
case LexStateRuleFieldNameStart:
case LexStateRuleFieldName:
lex_error(g, "unexpected EOF");
break;
}
// Resolve child references into pointers
for (int tuple_i = 0; tuple_i < g->rules.length; tuple_i += 1) {
RuleNode *node = g->rules.at(tuple_i);
assert(node->type == RuleNodeTypeTuple);
for (int child_i = 0; child_i < node->tuple.children.length; child_i += 1) {
RuleNode *child = node->tuple.children.at(child_i);
if (child->type == RuleNodeTypeSubRule) {
int line = child->lex_line + 1;
int column = child->lex_column + 1;
RuleNode *referenced_node = find_rule_node(g, &child->sub_rule.name);
if (!referenced_node) {
fprintf(stderr, "Grammar Error: Line %d, column %d: Rule not defined: '%s'\n",
line, column, buf_ptr(&child->sub_rule.name));
}
child->sub_rule.child = referenced_node;
}
}
}
// calculate the biggest tuple len
for (int i = 0; i < g->rules.length; i += 1) {
RuleNode *node = g->rules.at(i);
assert(node->type == RuleNodeTypeTuple);
g->biggest_tuple_len = max(g->biggest_tuple_len, node->tuple.children.length);
}
}
enum TemplateState {
@ -828,6 +956,8 @@ int main(int argc, char **argv) {
fprintf(out_f, " state = transition[%d][token->id];\n", state->index);
break;
case CodeGenTypeError:
fprintf(out_f, " token_index -= 1;\n");
fprintf(out_f, " token = &tokens->at(token_index);\n");
fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(code->error.msg));
break;
case CodeGenTypeSave:
@ -843,7 +973,12 @@ int main(int argc, char **argv) {
fprintf(out_f, "%s\n", buf_ptr(code_text));
fprintf(out_f, " return root;\n");
} else {
zig_panic("TODO capture non-root");
fprintf(out_f, " ParserGenNode *parent_node = stack.at(stack.length - 2);\n");
Buf *dest = buf_sprintf("parent_node->data[parent_node->next_index++].%s",
buf_ptr(code->capture.union_field_name));
Buf *code_text = fill_template(code->capture.body, buf_ptr(dest),
code->capture.field_names);
fprintf(out_f, "%s\n", buf_ptr(code_text));
}
break;
case CodeGenTypePopNode:

View File

@ -36,15 +36,15 @@ enum TokenId {
// TODO: debug delete this
enum TokenId {
TokenIdStar = 0,
TokenIdLParen = 1,
TokenIdLParen = 0,
TokenIdRParen = 1,
TokenIdEof = 2,
TokenIdStar = 3,
TokenIdSymbol,
TokenIdKeywordFn,
TokenIdKeywordReturn,
TokenIdKeywordMut,
TokenIdKeywordConst,
TokenIdRParen,
TokenIdComma,
TokenIdLBrace,
TokenIdRBrace,