stage1: memoize strings in the AST

Currently, stage1 runs astgen for every comptime function call, resulting in identifier strings being allocated multiple times, wasting memory. As a workaround until the code is adjusted to make astgen run only once per source node, we memoize the result into the AST.

* Rename `ir_gen_*` to `astgen_*` - Oops, meant to do this in a separate commit. My bad.
* tokenizer: avoid using designated initializer syntax. MSVC does not support it.
2025-12-27 08:33:15 +00:00 · 2021-05-28 15:22:03 -07:00 · 2021-05-28 15:22:03 -07:00 · f5d4fe3e17
commit f5d4fe3e17
parent 52b3daa90e
5 changed files with 489 additions and 467 deletions
--- a/src/stage1/all_types.hpp
+++ b/src/stage1/all_types.hpp
@ -1123,6 +1123,14 @@ struct AstNodeContainerInitExpr {
    ContainerInitKind kind;
 };

+struct AstNodeIdentifier {
+    Buf *name;
+};
+
+struct AstNodeEnumLiteral {
+    Buf *name;
+};
+
 struct AstNodeBoolLiteral {
    bool value;
 };
@ -1204,6 +1212,12 @@ struct AstNode {
        AstNodeAwaitExpr await_expr;
        AstNodeSuspend suspend;
        AstNodeAnyFrameType anyframe_type;
+
+        // These are part of an astgen workaround to use less memory by
+        // memoizing into the AST. Once astgen is modified to only run once
+        // per corresponding source node, this workaround can be removed.
+        AstNodeIdentifier identifier;
+        AstNodeEnumLiteral enum_literal;
    } data;

    // This is a function for use in the debugger to print
--- a/src/stage1/astgen.cpp
+++ b/src/stage1/astgen.cpp
--- a/src/stage1/parser.cpp
+++ b/src/stage1/parser.cpp
@ -202,6 +202,19 @@ static void put_back_token(ParseContext *pc) {
    pc->current_token -= 1;
 }

+static Buf *token_string_literal_buf(RootStruct *root_struct, TokenIndex token) {
+    Error err;
+    assert(root_struct->token_ids[token] == TokenIdStringLiteral);
+    const char *source = buf_ptr(root_struct->source_code);
+    size_t byte_offset = root_struct->token_locs[token].offset;
+    size_t bad_index;
+    Buf *str = buf_alloc();
+    if ((err = source_string_literal_buf(source + byte_offset, str, &bad_index))) {
+        zig_panic("TODO handle string literal parse error");
+    }
+    return str;
+}
+
 static Buf *token_buf(ParseContext *pc, TokenIndex token) {
    if (token == 0)
        return nullptr;
@ -3465,19 +3478,6 @@ Error source_char_literal(const char *source, uint32_t *result, size_t *bad_inde
 }


-Buf *token_string_literal_buf(RootStruct *root_struct, TokenIndex token) {
-    Error err;
-    assert(root_struct->token_ids[token] == TokenIdStringLiteral);
-    const char *source = buf_ptr(root_struct->source_code);
-    size_t byte_offset = root_struct->token_locs[token].offset;
-    size_t bad_index;
-    Buf *str = buf_alloc();
-    if ((err = source_string_literal_buf(source + byte_offset, str, &bad_index))) {
-        zig_panic("TODO handle string literal parse error");
-    }
-    return str;
-}
-
 Buf *token_identifier_buf(RootStruct *root_struct, TokenIndex token) {
    Error err;
    const char *source = buf_ptr(root_struct->source_code);
@ -3515,14 +3515,15 @@ Buf *token_identifier_buf(RootStruct *root_struct, TokenIndex token) {

 Buf *node_identifier_buf(AstNode *node) {
    assert(node->type == NodeTypeIdentifier);
-    RootStruct *root_struct = node->owner->data.structure.root_struct;
-    return token_identifier_buf(root_struct, node->main_token);
-}
-
-Buf *node_string_literal_buf(AstNode *node) {
-    assert(node->type == NodeTypeStringLiteral);
-    RootStruct *root_struct = node->owner->data.structure.root_struct;
-    return token_string_literal_buf(root_struct, node->main_token);
+    // Currently, stage1 runs astgen for every comptime function call,
+    // resulting in the allocation here wasting memory. As a workaround until
+    // the code is adjusted to make astgen run only once per source node,
+    // we memoize the result into the AST here.
+    if (node->data.identifier.name == nullptr) {
+        RootStruct *root_struct = node->owner->data.structure.root_struct;
+        node->data.identifier.name = token_identifier_buf(root_struct, node->main_token);
+    }
+    return node->data.identifier.name;
 }

 void token_number_literal_bigint(RootStruct *root_struct, BigInt *result, TokenIndex token) {
--- a/src/stage1/parser.hpp
+++ b/src/stage1/parser.hpp
@ -19,10 +19,8 @@ void ast_print(AstNode *node, int indent);
 void ast_visit_node_children(AstNode *node, void (*visit)(AstNode **, void *context), void *context);

 Buf *node_identifier_buf(AstNode *node);
-Buf *node_string_literal_buf(AstNode *node);

 Buf *token_identifier_buf(RootStruct *root_struct, TokenIndex token);
-Buf *token_string_literal_buf(RootStruct *root_struct, TokenIndex token);

 void token_number_literal_bigint(RootStruct *root_struct, BigInt *result, TokenIndex token);

--- a/src/stage1/tokenizer.cpp
+++ b/src/stage1/tokenizer.cpp
@ -291,11 +291,11 @@ static void tokenize_error(Tokenize *t, const char *format, ...) {

 static void begin_token(Tokenize *t, TokenId id) {
    t->out->ids.append(id);
-    t->out->locs.append({
-        .offset = (uint32_t) t->pos,
-        .line = t->line,
-        .column = t->column,
-    });
+    TokenLoc tok_loc;
+    tok_loc.offset = (uint32_t) t->pos;
+    tok_loc.line = t->line;
+    tok_loc.column = t->column;
+    t->out->locs.append(tok_loc);
 }

 static void cancel_token(Tokenize *t) {