zig/lib/compiler/resinator/comments.zig

//! Expects to run after a C preprocessor step that preserves comments.
//!
//! `rc` has a peculiar quirk where something like `blah/**/blah` will be
//! transformed into `blahblah` during parsing. However, `clang -E` will
//! transform it into `blah blah`, so in order to match `rc`, we need
//! to remove comments ourselves after the preprocessor runs.
//! Note: Multiline comments that actually span more than one line do
//! get translated to a space character by `rc`.
//!
//! Removing comments before lexing also allows the lexer to not have to
//! deal with comments, which would complicate its implementation (this is something
//! of a tradeoff, as removing comments in a separate pass means that we'll
//! need to iterate the source twice instead of once, but having to deal with
//! comments when lexing would be a pain).
const std = @import("std");
const Allocator = std.mem.Allocator;
const UncheckedSliceWriter = @import("utils.zig").UncheckedSliceWriter;
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const LineHandler = @import("lex.zig").LineHandler;
const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;

/// `buf` must be at least as long as `source`
/// In-place transformation is supported (i.e. `source` and `buf` can be the same slice)
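/// For example (see the `mixed` test at the bottom of this file),
/// `removeComments("hel/* comment */lo", buf, null)` returns a slice containing `hello`.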
pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
    std.debug.assert(buf.len >= source.len);
    var result = UncheckedSliceWriter{ .slice = buf };
    const State = enum {
        start,
        forward_slash,
        line_comment,
        multiline_comment,
        multiline_comment_end,
        single_quoted,
        single_quoted_escape,
        double_quoted,
        double_quoted_escape,
    };
    var state: State = .start;
    var index: usize = 0;
    var pending_start: ?usize = null;
    var line_handler = LineHandler{ .buffer = source };
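    // Single pass over `source`: the state machine below tracks whether we are inside a
    // comment or a quoted string literal and writes everything except the comments
    // themselves through to `result` (line endings within multiline comments are preserved).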
    while (index < source.len) : (index += 1) {
        const c = source[index];
        // TODO: Disallow \x1A, \x00, \x7F in comments. At least \x1A and \x00 can definitely
        // cause errors or parsing weirdness in the Win32 RC compiler. These are disallowed
        // in the lexer, but comments are stripped before getting to the lexer.
        switch (state) {
            .start => switch (c) {
                '/' => {
                    state = .forward_slash;
                    pending_start = index;
                },
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                else => {
                    switch (c) {
                        '"' => state = .double_quoted,
                        '\'' => state = .single_quoted,
                        else => {},
                    }
                    result.write(c);
                },
            },
            .forward_slash => switch (c) {
                '/' => state = .line_comment,
                '*' => {
                    state = .multiline_comment;
                },
                else => {
                    _ = line_handler.maybeIncrementLineNumber(index);
                    result.writeSlice(source[pending_start.? .. index + 1]);
                    pending_start = null;
                    state = .start;
                },
            },
            .line_comment => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .start;
                },
                else => {},
            },
            .multiline_comment => switch (c) {
                '\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                '*' => state = .multiline_comment_end,
                else => {},
            },
            .multiline_comment_end => switch (c) {
                '\r' => {
                    try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
                    // We only want to treat this as a newline if it's part of a CRLF pair. If it's
                    // not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
                    // functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
                    if (formsLineEndingPair(source, '\r', index + 1)) {
                        state = .multiline_comment;
                    }
                },
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .multiline_comment;
                },
                '/' => {
                    state = .start;
                },
                else => {
                    state = .multiline_comment;
                },
            },
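            // The quoted-string states below exist so that comment-like sequences inside
            // string literals (e.g. "//not a comment") are passed through unchanged
            // (see the "within a string" test).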
            .single_quoted => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .single_quoted_escape;
                    result.write(c);
                },
                '\'' => {
                    state = .start;
                    result.write(c);
                },
                else => {
                    result.write(c);
                },
            },
            .single_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .single_quoted;
                    result.write(c);
                },
            },
            .double_quoted => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .double_quoted_escape;
                    result.write(c);
                },
                '"' => {
                    state = .start;
                    result.write(c);
                },
                else => {
                    result.write(c);
                },
            },
            .double_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .double_quoted;
                    result.write(c);
                },
            },
        }
    }
    return result.getWritten();
}
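
/// Handles a `\r` encountered while in a multiline comment. The `\r` is only written to
/// the output when it is part of a CRLF/LFCR pair; a bare `\r` is dropped from the output
/// but still treated as a line ending for line counting, with the affected lines collapsed
/// in `source_mappings` (see the comments in the function body for the details).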
inline fn handleMultilineCarriageReturn(
    source: []const u8,
    line_handler: *LineHandler,
    index: usize,
    result: *UncheckedSliceWriter,
    source_mappings: ?*SourceMappings,
) !void {
    // This is a dumb way to go about this, but basically we want to determine
    // if this is part of a distinct CRLF or LFCR pair. This function call will detect
    // LFCR pairs correctly since the function we're in will only be called on CR,
    // but will not detect CRLF pairs since it only looks at the line ending before the
    // CR. So, we do a second (forward) check if the first fails to detect CRLF that is
    // not part of another pair.
    const is_lfcr_pair = line_handler.currentIndexFormsLineEndingPair(index);
    const is_crlf_pair = !is_lfcr_pair and formsLineEndingPair(source, '\r', index + 1);
    // Note: Bare \r within a multiline comment should *not* be treated as a line ending for the
    // purposes of removing comments, but *should* be treated as a line ending for the
    // purposes of line counting/source mapping
    _ = line_handler.incrementLineNumber(index);
    // So only write the \r if it's part of a CRLF/LFCR pair
    if (is_lfcr_pair or is_crlf_pair) {
        result.write('\r');
    }
    // And otherwise, we want to collapse the source mapping so that we can still know which
    // line came from where.
    else {
        // Because the line gets collapsed, we need to decrement line number so that
        // the next collapse acts on the first of the collapsed line numbers
        line_handler.line_number -= 1;
        if (source_mappings) |mappings| {
            try mappings.collapse(line_handler.line_number, 1);
        }
    }
}
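
/// Like `removeComments`, but writes the result into a newly allocated buffer that is
/// resized to fit. Caller owns the returned memory.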
pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
    const buf = try allocator.alloc(u8, source.len);
    errdefer allocator.free(buf);
    const result = try removeComments(source, buf, source_mappings);
    return allocator.realloc(buf, result.len);
}
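
// A usage sketch (illustrative, not part of the original file), assuming an `allocator`
// and an `rc_source` slice are in scope:
//
//     const stripped = try removeCommentsAlloc(allocator, rc_source, null);
//     defer allocator.free(stripped);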

fn testRemoveComments(expected: []const u8, source: []const u8) !void {
    const result = try removeCommentsAlloc(std.testing.allocator, source, null);
    defer std.testing.allocator.free(result);
    try std.testing.expectEqualStrings(expected, result);
}

test "basic" {
    try testRemoveComments("", "// comment");
    try testRemoveComments("", "/* comment */");
}

test "mixed" {
    try testRemoveComments("hello", "hello// comment");
    try testRemoveComments("hello", "hel/* comment */lo");
}

test "within a string" {
    // escaped " is \"
    try testRemoveComments(
        \\blah"//som\"/*ething*/"BLAH
    ,
        \\blah"//som\"/*ething*/"BLAH
    );
}

test "line comments retain newlines" {
    try testRemoveComments(
        \\
        \\
        \\
    ,
        \\// comment
        \\// comment
        \\// comment
    );
    try testRemoveComments("\r\n", "//comment\r\n");
}

test "unfinished multiline comment" {
    try testRemoveComments(
        \\unfinished
        \\
    ,
        \\unfinished/*
        \\
    );
}

test "crazy" {
    try testRemoveComments(
        \\blah"/*som*/\""BLAH
    ,
        \\blah"/*som*/\""/*ething*/BLAH
    );
    try testRemoveComments(
        \\blah"/*som*/"BLAH RCDATA "BEGIN END
        \\
        \\
        \\hello
        \\"
    ,
        \\blah"/*som*/"/*ething*/BLAH RCDATA "BEGIN END
        \\// comment
        \\//"blah blah" RCDATA {}
        \\hello
        \\"
    );
}

test "multiline comment with newlines" {
    // bare \r is not treated as a newline
    try testRemoveComments("blahblah", "blah/*some\rthing*/blah");
    try testRemoveComments(
        \\blah
        \\blah
    ,
        \\blah/*some
        \\thing*/blah
    );
    try testRemoveComments(
        "blah\r\nblah",
        "blah/*some\r\nthing*/blah",
    );
    // handle *<not /> correctly
    try testRemoveComments(
        \\blah
        \\
        \\
    ,
        \\blah/*some
        \\thing*
        \\/bl*ah*/
    );
}

test "comments appended to a line" {
    try testRemoveComments(
        \\blah
        \\blah
    ,
        \\blah // line comment
        \\blah
    );
    try testRemoveComments(
        "blah \r\nblah",
        "blah // line comment\r\nblah",
    );
}

test "remove comments with mappings" {
    const allocator = std.testing.allocator;
    var mut_source = "blah/*\rcommented line*\r/blah".*;
    var mappings = SourceMappings{};
    _ = try mappings.files.put(allocator, "test.rc");
    try mappings.set(1, 1, 0);
    try mappings.set(2, 2, 0);
    try mappings.set(3, 3, 0);
    defer mappings.deinit(allocator);
    const result = try removeComments(&mut_source, &mut_source, &mappings);
    try std.testing.expectEqualStrings("blahblah", result);
    try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
    try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
}

test "in place" {
    var mut_source = "blah /* comment */ blah".*;
    const result = try removeComments(&mut_source, &mut_source, null);
    try std.testing.expectEqualStrings("blah blah", result);
}
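
// An additional illustrative test (not in the original file) for the quirk described in
// the module-level doc comment: an empty `/**/` comment joins the surrounding tokens
// rather than being replaced with a space.
test "empty multiline comment joins tokens" {
    try testRemoveComments("blahblah", "blah/**/blah");
}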