generated parser understands tuples

This commit is contained in:
Andrew Kelley 2015-11-06 22:11:47 -07:00
parent 4ecb37a8a4
commit 72be61fc0a
7 changed files with 244 additions and 51 deletions

View File

@ -26,6 +26,8 @@ readable, safe, optimal, and concise code to solve any computing problem.
* Resilient to parsing errors to make IDE integration work well. * Resilient to parsing errors to make IDE integration work well.
* Source code is UTF-8. * Source code is UTF-8.
* Shebang line OK so language can be used for "scripting" as well. * Shebang line OK so language can be used for "scripting" as well.
* Ability to mark functions as test and automatically run them in test mode.
* Memory zeroed by default, unless you initialize with "uninitialized".
## Roadmap ## Roadmap

View File

@ -36,7 +36,7 @@ void buf_appendf(Buf *buf, const char *format, ...) {
int orig_len = buf_len(buf); int orig_len = buf_len(buf);
buf_resize(buf, orig_len + required_size); buf_resize(buf, orig_len + len1);
int len2 = vsnprintf(buf_ptr(buf) + orig_len, required_size, format, ap2); int len2 = vsnprintf(buf_ptr(buf) + orig_len, required_size, format, ap2);
assert(len2 == len1); assert(len2 == len1);

View File

@ -55,6 +55,14 @@ static inline void buf_init_from_mem(Buf *buf, const char *ptr, int len) {
buf->list.at(buf_len(buf)) = 0; buf->list.at(buf_len(buf)) = 0;
} }
// Initializes `buf` from the NUL-terminated C string `str`.
// The length handed to buf_init_from_mem is strlen(str), i.e. it does not
// count the terminating NUL of `str`.
static inline void buf_init_from_str(Buf *buf, const char *str) {
    int str_len = strlen(str);
    buf_init_from_mem(buf, str, str_len);
}
// Initializes `buf` from the current pointer/length pair of `other`
// by delegating to buf_init_from_mem.
static inline void buf_init_from_buf(Buf *buf, Buf *other) {
    const char *src_ptr = buf_ptr(other);
    int src_len = buf_len(other);
    buf_init_from_mem(buf, src_ptr, src_len);
}
static inline Buf *buf_create_from_mem(const char *ptr, int len) { static inline Buf *buf_create_from_mem(const char *ptr, int len) {
Buf *buf = allocate<Buf>(1); Buf *buf = allocate<Buf>(1);
buf_init_from_mem(buf, ptr, len); buf_init_from_mem(buf, ptr, len);

View File

@ -28,7 +28,6 @@ static int usage(const char *arg0) {
fprintf(stderr, "Usage: %s [command] [options] target\n" fprintf(stderr, "Usage: %s [command] [options] target\n"
"Commands:\n" "Commands:\n"
" build create an executable from target\n" " build create an executable from target\n"
" link turn a .o file into an executable\n"
"Options:\n" "Options:\n"
" --output output file\n" " --output output file\n"
" --version print version number and exit\n" " --version print version number and exit\n"

View File

@ -78,3 +78,7 @@ void ast_print(AstNode *node, int indent) {
break; break;
} }
} }
// Placeholder implementation: `token` is currently ignored and the function
// always returns nullptr.
AstNode *ast_create_root(Token *token) {
return nullptr;
}

View File

@ -89,4 +89,6 @@ const char *node_type_str(NodeType node_type);
void ast_print(AstNode *node, int indent); void ast_print(AstNode *node, int indent);
AstNode *ast_create_root(Token *token);
#endif #endif

View File

@ -187,29 +187,39 @@ struct RuleNode {
}; };
enum ParserStateType { enum CodeGenType {
ParserStateTypeError, CodeGenTypeTransition,
ParserStateTypeOk, CodeGenTypeError,
ParserStateTypeCapture, CodeGenTypeSave,
CodeGenTypePushNode,
CodeGenTypeCapture,
CodeGenTypePopNode,
CodeGenTypeEatToken,
}; };
struct ParserStateError { struct CodeGenError {
Buf *msg; Buf *msg;
}; };
struct ParserStateCapture { struct CodeGenCapture {
Buf *body; Buf *body;
bool is_root;
Buf *field_names;
};
struct CodeGen {
CodeGenType type;
union {
CodeGenError error;
CodeGenCapture capture;
};
}; };
struct ParserState { struct ParserState {
ParserStateType type; ZigList<CodeGen *> code_gen_list;
// One for each token ID. // One for each token ID.
ParserState **transition; ParserState **transition;
int index; int index;
union {
ParserStateError error;
ParserStateCapture capture;
};
}; };
enum LexState { enum LexState {
@ -234,6 +244,7 @@ struct Gen {
ZigList<ParserState *> transition_table; ZigList<ParserState *> transition_table;
ZigList<Token *> tokens; ZigList<Token *> tokens;
RuleNode *root; RuleNode *root;
int biggest_tuple_len;
Buf *in_buf; Buf *in_buf;
LexState lex_state; LexState lex_state;
@ -249,9 +260,8 @@ struct Gen {
int lex_body_end; int lex_body_end;
}; };
static ParserState *create_state(Gen *g, ParserStateType type) { static ParserState *create_state(Gen *g) {
ParserState *state = allocate<ParserState>(1); ParserState *state = allocate<ParserState>(1);
state->type = type;
state->index = g->transition_table.length; state->index = g->transition_table.length;
state->transition = allocate<ParserState*>(g->tokens.length); state->transition = allocate<ParserState*>(g->tokens.length);
g->transition_table.append(state); g->transition_table.append(state);
@ -264,28 +274,93 @@ static void fill_state_with_transition(Gen *g, ParserState *source, ParserState
} }
} }
static void gen(Gen *g, RuleNode *node) { static void state_add_code(ParserState *state, CodeGen *code) {
state->code_gen_list.append(code);
}
// Appends a "save token" instruction to `state`. When the parser is emitted,
// this instruction stores the current token into the next free data slot of
// the top parse node.
static void state_add_save_token(ParserState *state) {
    CodeGen *save_code = allocate<CodeGen>(1);
    save_code->type = CodeGenTypeSave;
    state_add_code(state, save_code);
}
// Appends an "error" instruction to `state`. The instruction keeps the `msg`
// pointer as given (it is not duplicated here); the emitted parser reports
// that message through ast_error().
static void state_add_error(ParserState *state, Buf *msg) {
    CodeGen *error_code = allocate<CodeGen>(1);
    error_code->error.msg = msg;
    error_code->type = CodeGenTypeError;
    state_add_code(state, error_code);
}
// Appends a "transition" instruction to `state`. The emitted parser uses it
// to look up the next state in the transition table, keyed by this state's
// index and the current token id.
static void state_add_transition(ParserState *state) {
    CodeGen *transition_code = allocate<CodeGen>(1);
    transition_code->type = CodeGenTypeTransition;
    state_add_code(state, transition_code);
}
// Appends a "push node" instruction to `state`. The emitted parser allocates
// a fresh ParserGenNode, makes it the top node, and pushes it onto its stack.
static void state_add_push_node(ParserState *state) {
    CodeGen *push_code = allocate<CodeGen>(1);
    push_code->type = CodeGenTypePushNode;
    state_add_code(state, push_code);
}
// Creates (but does not attach to any state) a "capture" instruction.
//
//   body             - template text of the rule body; the pointer is stored
//                      as given.
//   is_root          - whether this capture belongs to the grammar's root rule.
//   field_name_count - number of Buf slots to allocate in field_names, one
//                      per tuple child; the caller fills them in afterwards.
//
// Returns the new instruction; the caller is expected to add it to a state.
static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count) {
    CodeGen *capture_code = allocate<CodeGen>(1);
    capture_code->type = CodeGenTypeCapture;

    CodeGenCapture *capture = &capture_code->capture;
    capture->body = body;
    capture->is_root = is_root;
    capture->field_names = allocate<Buf>(field_name_count);

    return capture_code;
}
// Appends a "pop node" instruction to `state`. The emitted parser pops its
// node stack and resets the top node to the new last entry (or nullptr when
// the stack is empty).
static void state_add_pop_node(ParserState *state) {
    CodeGen *pop_code = allocate<CodeGen>(1);
    pop_code->type = CodeGenTypePopNode;
    state_add_code(state, pop_code);
}
// Appends an "eat token" instruction to `state`. The emitted parser advances
// its token index and reloads the current token (nullptr once the token list
// is exhausted).
static void state_add_eat_token(ParserState *state) {
    CodeGen *eat_code = allocate<CodeGen>(1);
    eat_code->type = CodeGenTypeEatToken;
    state_add_code(state, eat_code);
}
static void gen(Gen *g, RuleNode *node, Buf *out_field_name) {
switch (node->type) { switch (node->type) {
case RuleNodeTypeToken: case RuleNodeTypeToken:
{ {
ParserState *ok_state = create_state(g, ParserStateTypeOk); buf_init_from_str(out_field_name, "token");
ParserState *err_state = create_state(g, ParserStateTypeError);
err_state->error.msg = buf_sprintf("expected token '%s'", buf_ptr(&node->token.token->name)); state_add_save_token(g->cur_state);
ParserState *ok_state = create_state(g);
ParserState *err_state = create_state(g);
state_add_error(err_state, buf_sprintf("expected token '%s'", buf_ptr(&node->token.token->name)));
fill_state_with_transition(g, g->cur_state, err_state); fill_state_with_transition(g, g->cur_state, err_state);
g->cur_state->transition[node->token.token->id] = ok_state; g->cur_state->transition[node->token.token->id] = ok_state;
state_add_transition(g->cur_state);
state_add_eat_token(g->cur_state);
g->cur_state = ok_state; g->cur_state = ok_state;
} }
break; break;
case RuleNodeTypeTuple: case RuleNodeTypeTuple:
{ {
buf_init_from_str(out_field_name, "node");
state_add_push_node(g->cur_state);
bool is_root = (node == g->root);
int field_name_count = node->tuple.children.length;
CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count);
for (int i = 0; i < node->tuple.children.length; i += 1) { for (int i = 0; i < node->tuple.children.length; i += 1) {
RuleNode *child = node->tuple.children.at(i); RuleNode *child = node->tuple.children.at(i);
gen(g, child); gen(g, child, &code->capture.field_names[i]);
} }
g->cur_state->type = ParserStateTypeCapture; state_add_code(g->cur_state, code);
g->cur_state->capture.body = &node->tuple.body;
state_add_pop_node(g->cur_state);
} }
break; break;
case RuleNodeTypeMany: case RuleNodeTypeMany:
@ -301,7 +376,10 @@ static void gen(Gen *g, RuleNode *node) {
zig_panic("TODO"); zig_panic("TODO");
break; break;
case RuleNodeTypeSubRule: case RuleNodeTypeSubRule:
zig_panic("TODO"); {
RuleNode *child = node->sub_rule.child;
gen(g, child, out_field_name);
}
break; break;
} }
} }
@ -407,6 +485,9 @@ static void end_token_name(Gen *g) {
assert(g->lex_cur_rule->type == RuleNodeTypeTuple); assert(g->lex_cur_rule->type == RuleNodeTypeTuple);
g->lex_cur_rule->tuple.children.append(node); g->lex_cur_rule->tuple.children.append(node);
g->biggest_tuple_len = max(g->biggest_tuple_len, g->lex_cur_rule->tuple.children.length);
lex_pop_stack(g); lex_pop_stack(g);
} }
@ -571,6 +652,76 @@ static void initialize_rules(Gen *g) {
} }
} }
// Scanner states used by fill_template while expanding `$` placeholders.
enum TemplateState {
// Copying ordinary characters through to the output.
TemplateStateStart,
// The previous character was '$'; the next one selects the placeholder kind.
TemplateStateDollar,
// Inside the decimal digits of a numbered `$N` reference.
TemplateStateNumber,
};
static Buf *fill_template(Buf *body, const char *result_name, Buf *field_names) {
//fprintf(stderr, "fill template input:\n%s\n", buf_ptr(body));
Buf *result = buf_alloc();
TemplateState state = TemplateStateStart;
int digit_start;
for (int i = 0; i < buf_len(body); i += 1) {
uint8_t c = buf_ptr(body)[i];
switch (state) {
case TemplateStateStart:
switch (c) {
case '$':
state = TemplateStateDollar;
break;
default:
buf_append_char(result, c);
break;
}
break;
case TemplateStateDollar:
switch (c) {
case '$':
buf_append_str(result, result_name);
state = TemplateStateStart;
break;
case DIGIT:
digit_start = i;
state = TemplateStateNumber;
break;
default:
buf_append_char(result, '$');
buf_append_char(result, c);
state = TemplateStateStart;
break;
}
break;
case TemplateStateNumber:
switch (c) {
case DIGIT:
// nothing
break;
default:
{
Buf *num_buf = buf_create_from_mem(&buf_ptr(body)[digit_start], i - digit_start);
int index = atoi(buf_ptr(num_buf)) - 1;
buf_appendf(result, "(top_node->data[%d].%s)%c",
index, buf_ptr(&field_names[index]), c);
state = TemplateStateStart;
}
break;
}
break;
}
}
switch (state) {
case TemplateStateStart:
// OK
break;
default:
zig_panic("unable to fill grammar template");
}
//fprintf(stderr, "fill template output:\n%s\n", buf_ptr(result));
return result;
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
const char *in_filename = argv[1]; const char *in_filename = argv[1];
@ -603,8 +754,9 @@ int main(int argc, char **argv) {
g.root = g.rules.at(0); g.root = g.rules.at(0);
g.cur_state = create_state(&g, ParserStateTypeOk); g.cur_state = create_state(&g);
gen(&g, g.root); Buf root_field_name = {0};
gen(&g, g.root, &root_field_name);
fprintf(out_f, "/* This file is generated by parsergen.cpp */\n"); fprintf(out_f, "/* This file is generated by parsergen.cpp */\n");
fprintf(out_f, "\n"); fprintf(out_f, "\n");
@ -627,15 +779,14 @@ int main(int argc, char **argv) {
} }
fprintf(out_f, "\n"); fprintf(out_f, "\n");
/* TODO fprintf(out_f, "struct ParserGenNode {\n");
fprintf(out_f, "struct ParserGenNode{\n"); fprintf(out_f, " int next_index;\n");
fprintf(out_f, " union {\n"); fprintf(out_f, " union {\n");
fprintf(out_f, " [%d];\n", biggest_tuple_len);
fprintf(out_f, " Token *token;\n"); fprintf(out_f, " Token *token;\n");
fprintf(out_f, " };\n"); fprintf(out_f, " AstNode *node;\n");
fprintf(out_f, " } data[%d];\n", g.biggest_tuple_len);
fprintf(out_f, "};\n"); fprintf(out_f, "};\n");
fprintf(out_f, "\n"); fprintf(out_f, "\n");
*/
fprintf(out_f, "AstNode * ast_parse(Buf *buf, ZigList<Token> *tokens) {\n"); fprintf(out_f, "AstNode * ast_parse(Buf *buf, ZigList<Token> *tokens) {\n");
@ -655,39 +806,66 @@ int main(int argc, char **argv) {
fprintf(out_f, " int state = 0;\n"); fprintf(out_f, " int state = 0;\n");
fprintf(out_f, " int token_index = 0;\n");
fprintf(out_f, " Token *token = &tokens->at(token_index);\n");
fprintf(out_f, " AstNode *root = nullptr;\n"); fprintf(out_f, " AstNode *root = nullptr;\n");
fprintf(out_f, " ZigList<ParserGenNode *> stack = {0};\n");
fprintf(out_f, " ParserGenNode *top_node = nullptr;\n");
fprintf(out_f, " for (int i = 0; i < tokens->length; i += 1) {\n"); fprintf(out_f, " for (;;) {\n");
fprintf(out_f, " Token *token = &tokens->at(i);\n");
fprintf(out_f, " switch (state) {\n"); fprintf(out_f, " switch (state) {\n");
for (int i = 0; i < g.transition_table.length; i += 1) { for (int state_i = 0; state_i < g.transition_table.length; state_i += 1) {
ParserState *state = g.transition_table.at(i); ParserState *state = g.transition_table.at(state_i);
fprintf(out_f, " case %d:\n", i); fprintf(out_f, " case %d: {\n", state_i);
switch (state->type) { for (int code_i = 0; code_i < state->code_gen_list.length; code_i += 1) {
case ParserStateTypeError: CodeGen *code = state->code_gen_list.at(code_i);
fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(state->error.msg)); switch (code->type) {
break; case CodeGenTypeTransition:
case ParserStateTypeOk: fprintf(out_f, " assert(transition[%d][token->id] >= 0);\n", state->index);
fprintf(out_f, " assert(transition[%d][token->id] >= 0);\n", state->index); fprintf(out_f, " assert(transition[%d][token->id] < %d);\n",
fprintf(out_f, " assert(transition[%d][token->id] < %d);\n", state->index, g.transition_table.length);
state->index, g.transition_table.length); fprintf(out_f, " state = transition[%d][token->id];\n", state->index);
fprintf(out_f, " state = transition[%d][token->id];\n", state->index); break;
break; case CodeGenTypeError:
case ParserStateTypeCapture: fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(code->error.msg));
// TODO fprintf(out_f, " %s\n", buf_ptr(state->capture.body)); break;
fprintf(out_f, " state = transition[%d][token->id];\n", state->index); case CodeGenTypeSave:
break; fprintf(out_f, " top_node->data[top_node->next_index++].token = token;\n");
break;
case CodeGenTypePushNode:
fprintf(out_f, " top_node = allocate<ParserGenNode>(1);\n");
fprintf(out_f, " stack.append(top_node);\n");
break;
case CodeGenTypeCapture:
if (code->capture.is_root) {
Buf *code_text = fill_template(code->capture.body, "root", code->capture.field_names);
fprintf(out_f, "%s\n", buf_ptr(code_text));
fprintf(out_f, " return root;\n");
} else {
zig_panic("TODO capture non-root");
}
break;
case CodeGenTypePopNode:
fprintf(out_f, " stack.pop();\n");
fprintf(out_f, " top_node = stack.length ? stack.last() : nullptr;\n");
break;
case CodeGenTypeEatToken:
fprintf(out_f, " token_index += 1;\n");
fprintf(out_f, " token = (token_index < tokens->length) ? &tokens->at(token_index) : nullptr;\n");
break;
}
} }
fprintf(out_f, " break;\n"); fprintf(out_f, " break;\n");
fprintf(out_f, " }\n");
} }
fprintf(out_f, " default:\n"); fprintf(out_f, " default:\n");
fprintf(out_f, " zig_panic(\"unreachable\");\n"); fprintf(out_f, " zig_panic(\"unreachable\");\n");
fprintf(out_f, " }\n"); fprintf(out_f, " }\n");
fprintf(out_f, " }\n"); fprintf(out_f, " }\n");
fprintf(out_f, " return root;\n"); fprintf(out_f, " zig_panic(\"unreachable\");\n");
fprintf(out_f, "}\n"); fprintf(out_f, "}\n");
return 0;
} }