std zig tokenizer: don't require 3 newlines at the end of the source

This commit is contained in:
Andrew Kelley 2018-02-10 14:52:39 -05:00
parent a2bd9f8912
commit 8c31eaf2a8
4 changed files with 14 additions and 17 deletions

View File

@@ -565,6 +565,15 @@ fn fmtMain(allocator: &mem.Allocator, file_paths: []const []const u8) !void {
var file = try io.File.openRead(allocator, file_path);
defer file.close();
const source_code = io.readFileAlloc(allocator, file_path) catch |err| {
warn("unable to open '{}': {}", file_path, err);
continue;
};
defer allocator.free(source_code);
var tokenizer = std.zig.Tokenizer.init(source_code);
var parser = std.zig.Parser.init(&tokenizer, allocator, file_path);
defer parser.deinit();
warn("opened {} (todo tokenize and parse and render)\n", file_path);
}
}

View File

@@ -213,14 +213,11 @@ pub const Module = struct {
};
errdefer self.allocator.free(root_src_real_path);
const source_code = io.readFileAllocExtra(self.allocator, root_src_real_path, 3) catch |err| {
const source_code = io.readFileAlloc(self.allocator, root_src_real_path) catch |err| {
try printError("unable to open '{}': {}", root_src_real_path, err);
return err;
};
errdefer self.allocator.free(source_code);
source_code[source_code.len - 3] = '\n';
source_code[source_code.len - 2] = '\n';
source_code[source_code.len - 1] = '\n';
warn("====input:====\n");

View File

@@ -524,16 +524,11 @@ pub fn writeFile(allocator: &mem.Allocator, path: []const u8, data: []const u8)
/// On success, caller owns returned buffer.
pub fn readFileAlloc(allocator: &mem.Allocator, path: []const u8) ![]u8 {
return readFileAllocExtra(allocator, path, 0);
}
/// On success, caller owns returned buffer.
/// Allocates extra_len extra bytes at the end of the file buffer, which are uninitialized.
pub fn readFileAllocExtra(allocator: &mem.Allocator, path: []const u8, extra_len: usize) ![]u8 {
var file = try File.openRead(allocator, path);
defer file.close();
const size = try file.getEndPos();
const buf = try allocator.alloc(u8, size + extra_len);
const buf = try allocator.alloc(u8, size);
errdefer allocator.free(buf);
var adapter = FileInStream.init(&file);

View File

@@ -175,12 +175,7 @@ pub const Tokenizer = struct {
std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
}
/// buffer must end with "\n\n\n". This is so that attempting to decode
/// a the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
pub fn init(buffer: []const u8) Tokenizer {
std.debug.assert(buffer[buffer.len - 1] == '\n');
std.debug.assert(buffer[buffer.len - 2] == '\n');
std.debug.assert(buffer[buffer.len - 3] == '\n');
return Tokenizer {
.buffer = buffer,
.index = 0,
@@ -556,8 +551,9 @@ pub const Tokenizer = struct {
} else {
// check utf8-encoded character.
const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
// the last 3 bytes in the buffer are guaranteed to be '\n',
// which means we don't need to do any bounds checking here.
if (self.index + length >= self.buffer.len) {
return u3(self.buffer.len - self.index);
}
const bytes = self.buffer[self.index..self.index + length];
switch (length) {
2 => {