zig/lib/compiler/resinator/lex.zig

//! Expects to be run after the C preprocessor and after `removeComments`.
//! This means that the lexer assumes that:
//! - Splices ('\' at the end of a line) have been handled/collapsed.
//! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
//! - All comments have been removed.

const std = @import("std");
const ErrorDetails = @import("errors.zig").ErrorDetails;
const columnWidth = @import("literals.zig").columnWidth;
const code_pages = @import("code_pages.zig");
const SupportedCodePage = code_pages.SupportedCodePage;
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;

const dumpTokensDuringTests = false;

pub const default_max_string_literal_codepoints = 4097;

pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,
    line_number: usize,

    pub const Id = enum {
        literal,
        number,
        quoted_ascii_string,
        quoted_wide_string,
        operator,
        begin,
        end,
        comma,
        open_paren,
        close_paren,
        /// This Id is only used for errors, the Lexer will never return one
        /// of these from a `next` call.
        preprocessor_command,
        invalid,
        eof,

        pub fn nameForErrorDisplay(self: Id) []const u8 {
            return switch (self) {
                .literal => "<literal>",
                .number => "<number>",
                .quoted_ascii_string => "<quoted ascii string>",
                .quoted_wide_string => "<quoted wide string>",
                .operator => "<operator>",
                .begin => "<'{' or BEGIN>",
                .end => "<'}' or END>",
                .comma => ",",
                .open_paren => "(",
                .close_paren => ")",
                .preprocessor_command => "<preprocessor command>",
                .invalid => unreachable,
                .eof => "<eof>",
            };
        }
    };

    pub fn slice(self: Token, buffer: []const u8) []const u8 {
        return buffer[self.start..self.end];
    }

    /// Returns 0-based column
    pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
        const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source);

        var i: usize = line_start;
        var column: usize = 0;
        while (i < token.start) : (i += 1) {
            column += columnWidth(column, source[i], tab_columns);
        }
        return column;
    }

    // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
    //       (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
    pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
        const line_start = line_start: {
            if (token.start != 0) {
                // start checking at the byte before the token
                var index = token.start - 1;
                while (true) {
                    if (source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
                    if (index != 0) index -= 1 else break;
                }
            }
            break :line_start 0;
        };
        return line_start;
    }

    pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
        const line_start = line_start: {
            if (token.start != 0) {
                // start checking at the byte before the token
                var index = token.start - 1;
                while (true) {
                    if (source[index] == '\r' or source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
                    if (index != 0) index -= 1 else break;
                }
            }
            break :line_start 0;
        };
        return line_start;
    }

    pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
        const line_start = maybe_line_start orelse token.getLineStartForErrorDisplay(source);

        var line_end = line_start;
        while (line_end < source.len and source[line_end] != '\r' and source[line_end] != '\n') : (line_end += 1) {}
        return source[line_start..line_end];
    }

    pub fn isStringLiteral(token: Token) bool {
        return token.id == .quoted_ascii_string or token.id == .quoted_wide_string;
    }
};

pub const LineHandler = struct {
    line_number: usize = 1,
    buffer: []const u8,
    last_line_ending_index: ?usize = null,

    /// Like incrementLineNumber but checks that the current char is a line ending first.
    /// Returns the new line number if it was incremented, null otherwise.
    pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        const c = self.buffer[cur_index];
        if (c == '\r' or c == '\n') {
            return self.incrementLineNumber(cur_index);
        }
        return null;
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number if it was incremented, or null otherwise.
    pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        if (self.currentIndexFormsLineEndingPair(cur_index)) {
            self.last_line_ending_index = null;
            return null;
        } else {
            self.line_number += 1;
            self.last_line_ending_index = cur_index;
            return self.line_number;
        }
    }

    /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r \n\n)
    /// expects self.index and last_line_ending_index (if non-null) to contain line endings
    ///
    /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    ///       seems to drop all carriage returns during preprocessing and then replace all
    ///       remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    ///       Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    ///       since Clang converts unpaired <CR> into unpaired <LF>.
    pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
        if (self.last_line_ending_index == null) return false;

        // must immediately precede the current index, we know cur_index must
        // be >= 1 since last_line_ending_index is non-null (so if the subtraction
        // overflows it is a bug at the callsite of this function).
        if (self.last_line_ending_index.? != cur_index - 1) return false;

        const cur_line_ending = self.buffer[cur_index];
        const last_line_ending = self.buffer[self.last_line_ending_index.?];

        // sanity check
        std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
        std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');

        // can't be \n\n or \r\r
        if (last_line_ending == cur_line_ending) return false;

        return true;
    }
};

pub const LexError = error{
    UnfinishedStringLiteral,
    StringLiteralTooLong,
    InvalidNumberWithExponent,
    InvalidDigitCharacterInNumberLiteral,
    IllegalByte,
    IllegalByteOutsideStringLiterals,
    IllegalCodepointOutsideStringLiterals,
    IllegalByteOrderMark,
    IllegalPrivateUseCharacter,
    FoundCStyleEscapedQuote,
    CodePagePragmaMissingLeftParen,
    CodePagePragmaMissingRightParen,
    /// Can be caught and ignored
    CodePagePragmaInvalidCodePage,
    CodePagePragmaNotInteger,
    CodePagePragmaOverflow,
    CodePagePragmaUnsupportedCodePage,
    /// Can be caught and ignored
    CodePagePragmaInIncludedFile,
};

pub const Lexer = struct {
    const Self = @This();

    buffer: []const u8,
    index: usize,
    line_handler: LineHandler,
    at_start_of_line: bool = true,
    error_context_token: ?Token = null,
    current_code_page: SupportedCodePage,
    default_code_page: SupportedCodePage,
    source_mappings: ?*SourceMappings,
    max_string_literal_codepoints: u15,
    /// Needed to determine whether or not the output code page should
    /// be set in the parser.
    seen_pragma_code_pages: u2 = 0,
    last_pragma_code_page_token: ?Token = null,

    pub const Error = LexError;

    pub const LexerOptions = struct {
        default_code_page: SupportedCodePage = .windows1252,
        source_mappings: ?*SourceMappings = null,
        max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
    };

    pub fn init(buffer: []const u8, options: LexerOptions) Self {
        return Self{
            .buffer = buffer,
            .index = 0,
            .current_code_page = options.default_code_page,
            .default_code_page = options.default_code_page,
            .source_mappings = options.source_mappings,
            .max_string_literal_codepoints = options.max_string_literal_codepoints,
            .line_handler = .{ .buffer = buffer },
        };
    }

    pub fn dump(self: *Self, token: *const Token) void {
        std.debug.print("{s}:{d}: {s}\n", .{ @tagName(token.id), token.line_number, std.fmt.fmtSliceEscapeLower(token.slice(self.buffer)) });
    }

    pub const LexMethod = enum {
        whitespace_delimiter_only,
        normal,
        normal_expect_operator,
    };

    pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
        switch (method) {
            .whitespace_delimiter_only => return self.nextWhitespaceDelimeterOnly(),
            .normal => return self.nextNormal(),
            .normal_expect_operator => return self.nextNormalWithContext(.expect_operator),
        }
    }

    const StateWhitespaceDelimiterOnly = enum {
        start,
        literal,
        preprocessor,
        semicolon,
    };

    pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateWhitespaceDelimiterOnly.start;

        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            try self.checkForIllegalCodepoint(codepoint, false);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    // TODO: This should either be removed, or it should also include
                    //       the codepoints listed in disjoint_code_page.zig
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    else => {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .literal => switch (c) {
                    '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
            }
        }

        result.end = self.index;

        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);

        return result;
    }

    const StateNormal = enum {
        start,
        literal_or_quoted_wide_string,
        quoted_ascii_string,
        quoted_wide_string,
        quoted_ascii_string_escape,
        quoted_wide_string_escape,
        quoted_ascii_string_maybe_end,
        quoted_wide_string_maybe_end,
        literal,
        number_literal,
        preprocessor,
        semicolon,
        // end
        e,
        en,
        // begin
        b,
        be,
        beg,
        begi,
    };

    /// TODO: A not-terrible name
    pub fn nextNormal(self: *Self) LexError!Token {
        return self.nextNormalWithContext(.any);
    }

    pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateNormal.start;

        // Note: The Windows RC compiler uses a non-standard method of computing
        //       length for its 'string literal too long' errors; it isn't easily
        //       explained or intuitive (it's sort-of pre-parsed byte length but with
        //       a few of exceptions/edge cases).
        //
        // It also behaves strangely with non-ASCII codepoints, e.g. even though the default
        // limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
        // and 2048 𐐷 codepoints (2 UTF-16 code units each).
        //
        // TODO: Understand this more, bring it more in line with how the Win32 limits work.
        //       Alternatively, do something that makes more sense but may be more permissive.
        var string_literal_length: usize = 0;
        // Keeping track of the string literal column prevents pathological edge cases when
        // there are tons of tab stop characters within a string literal.
        var string_literal_column: usize = 0;
        var string_literal_collapsing_whitespace: bool = false;
        var still_could_have_exponent: bool = true;
        var exponent_index: ?usize = null;
        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            const in_string_literal = switch (state) {
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                .quoted_ascii_string_maybe_end,
                .quoted_wide_string_maybe_end,
                =>
                // If the current line is not the same line as the start of the string literal,
                // then we want to treat the current codepoint as 'not in a string literal'
                // for the purposes of detecting illegal codepoints. This means that we will
                // error on illegal-outside-string-literal characters that are outside string
                // literals from the perspective of a C preprocessor, but that may be
                // inside string literals from the perspective of the RC lexer. For example,
                // "hello
                // @"
                // will be treated as a single string literal by the RC lexer but the Win32
                // preprocessor will consider this an unclosed string literal followed by
                // the character @ and ", and will therefore error since the Win32 RC preprocessor
                // errors on the @ character outside string literals.
                //
                // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
                // at lex-time, and avoid the need for a separate step that checks for this edge-case
                // specifically.
                result.line_number == self.line_handler.line_number,
                else => false,
            };
            try self.checkForIllegalCodepoint(codepoint, in_string_literal);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    'L', 'l' => {
                        state = .literal_or_quoted_wide_string;
                        self.at_start_of_line = false;
                    },
                    'E', 'e' => {
                        state = .e;
                        self.at_start_of_line = false;
                    },
                    'B', 'b' => {
                        state = .b;
                        self.at_start_of_line = false;
                    },
                    '"' => {
                        state = .quoted_ascii_string;
                        self.at_start_of_line = false;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;

                        var dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    '+', '&', '|' => {
                        self.index += 1;
                        result.id = .operator;
                        self.at_start_of_line = false;
                        break;
                    },
                    '-' => {
                        if (context == .expect_operator) {
                            self.index += 1;
                            result.id = .operator;
                            self.at_start_of_line = false;
                            break;
                        } else {
                            state = .number_literal;
                            still_could_have_exponent = true;
                            exponent_index = null;
                            self.at_start_of_line = false;
                        }
                    },
                    '0'...'9', '~' => {
                        state = .number_literal;
                        still_could_have_exponent = true;
                        exponent_index = null;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    '{', '}' => {
                        self.index += 1;
                        result.id = if (c == '{') .begin else .end;
                        self.at_start_of_line = false;
                        break;
                    },
                    '(', ')' => {
                        self.index += 1;
                        result.id = if (c == '(') .open_paren else .close_paren;
                        self.at_start_of_line = false;
                        break;
                    },
                    ',' => {
                        self.index += 1;
                        result.id = .comma;
                        self.at_start_of_line = false;
                        break;
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                .end = self.index + 1,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                // Semi-colon acts as a line-terminator--everything is skipped until
                // the next line.
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .number_literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .number;
                        break;
                    },
                    '0'...'9' => {
                        if (exponent_index) |exp_i| {
                            if (self.index - 1 == exp_i) {
                                // Note: This being an error is a quirk of the preprocessor used by
                                //       the Win32 RC compiler.
                                self.error_context_token = .{
                                    .id = .number,
                                    .start = result.start,
                                    .end = self.index + 1,
                                    .line_number = self.line_handler.line_number,
                                };
                                return error.InvalidNumberWithExponent;
                            }
                        }
                    },
                    'e', 'E' => {
                        if (still_could_have_exponent) {
                            exponent_index = self.index;
                            still_could_have_exponent = false;
                        }
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                .end = self.index + 1,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        still_could_have_exponent = false;
                    },
                },
                .literal_or_quoted_wide_string => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    // zig fmt: on
                    => {
                        result.id = .literal;
                        break;
                    },
                    '"' => {
                        state = .quoted_wide_string;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;

                        var dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    else => {
                        state = .literal;
                    },
                },
                .literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                .e => switch (c) {
                    'N', 'n' => {
                        state = .en;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .en => switch (c) {
                    'D', 'd' => {
                        result.id = .end;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .b => switch (c) {
                    'E', 'e' => {
                        state = .be;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .be => switch (c) {
                    'G', 'g' => {
                        state = .beg;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .beg => switch (c) {
                    'I', 'i' => {
                        state = .begi;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .begi => switch (c) {
                    'N', 'n' => {
                        result.id = .begin;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .quoted_ascii_string, .quoted_wide_string => switch (c) {
                    '"' => {
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
                    },
                    '\\' => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
                    },
                    '\r' => {
                        string_literal_column = 0;
                        // \r doesn't count towards string literal length

                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    '\n' => {
                        string_literal_column = 0;
                        // first \n expands to <space><\n>
                        if (!string_literal_collapsing_whitespace) {
                            string_literal_length += 2;
                            string_literal_collapsing_whitespace = true;
                        }
                        // the rest are collapsed into the <space><\n>

                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
                    '\t', ' ', '\x0b', '\x0c' => {
                        if (!string_literal_collapsing_whitespace) {
                            // Literal tab characters are counted as the number of space characters
                            // needed to reach the next 8-column tab stop.
                            const width = columnWidth(string_literal_column, @intCast(c), 8);
                            string_literal_length += width;
                            string_literal_column += width;
                        }
                    },
                    else => {
                        string_literal_collapsing_whitespace = false;
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                },
                .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
                    '"' => {
                        self.error_context_token = .{
                            .id = .invalid,
                            .start = self.index - 1,
                            .end = self.index + 1,
                            .line_number = self.line_handler.line_number,
                        };
                        return error.FoundCStyleEscapedQuote;
                    },
                    else => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
                    },
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
                    '"' => {
                        state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        // Escaped quotes count as 1 char for string literal length checks.
                        // Since we did not increment on the first " (because it could have been
                        // the end of the quoted string), we increment here
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                    else => {
                        result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        break;
                    },
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
                .number_literal => {
                    result.id = .number;
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
                    result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                },
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                => {
                    self.error_context_token = .{
                        .id = .eof,
                        .start = self.index,
                        .end = self.index,
                        .line_number = self.line_handler.line_number,
                    };
                    return LexError.UnfinishedStringLiteral;
                },
            }
        }

        result.end = self.index;

        if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
            if (string_literal_length > self.max_string_literal_codepoints) {
                self.error_context_token = result;
                return LexError.StringLiteralTooLong;
            }
        }

        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);

        return result;
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number.
    fn incrementLineNumber(self: *Self) usize {
        _ = self.line_handler.incrementLineNumber(self.index);
        self.at_start_of_line = true;
        return self.line_handler.line_number;
    }

    fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
        const err = switch (codepoint.value) {
            // 0x00 = NUL
            // 0x1A = Substitute (treated as EOF)
            // NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
            //       is run through the clang preprocessor it will no longer have 0x1A characters in it.
            // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
            0x00, 0x1A, 0x7F => error.IllegalByte,
            // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
            // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
            0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
            // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
            // outside of string literals. Not exactly sure why this would be the case, though.
            // TODO: Make sure there aren't any exceptions
            '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
            // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
            // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
            // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
            // being 'missing' when included in a string literal (the Windows RC compiler acts as
            // if the codepoint was never part of the string literal).
            '\u{FEFF}' => error.IllegalByteOrderMark,
            // Similar deal with this private use codepoint, it gets skipped/ignored by the
            // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
            // enough of a footgun with no real use-cases that it's still worth erroring instead of
            // emulating the RC compiler's behavior, though.
            '\u{E000}' => error.IllegalPrivateUseCharacter,
            // These codepoints lead to strange errors when used outside of string literals,
            // and miscompilations when used within string literals. We avoid the miscompilation
            // within string literals and emit a warning, but outside of string literals it makes
            // more sense to just disallow these codepoints.
            0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
            else => return,
        };
        self.error_context_token = .{
            .id = .invalid,
            .start = self.index,
            .end = self.index + codepoint.byte_len,
            .line_number = self.line_handler.line_number,
        };
        return err;
    }

    fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
        const token = Token{
            .id = .preprocessor_command,
            .start = start,
            .end = end,
            .line_number = self.line_handler.line_number,
        };
        errdefer self.error_context_token = token;
        const full_command = self.buffer[start..end];

        const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
            error.NotPragma, error.NotCodePagePragma => return,
            else => |e| return e,
        }) orelse self.default_code_page;

        // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
        // > This pragma is not supported in an included resource file (.rc)
        //
        // Even though the Win32 behavior is to just ignore such directives silently,
        // this is an error in the lexer to allow for emitting warnings/errors when
        // such directives are found if that's wanted. The intention is for the lexer
        // to still be able to work correctly after this error is returned.
        if (self.source_mappings) |source_mappings| {
            if (!source_mappings.isRootFile(token.line_number)) {
                return error.CodePagePragmaInIncludedFile;
            }
        }

        self.seen_pragma_code_pages +|= 1;
        self.last_pragma_code_page_token = token;
        self.current_code_page = code_page;
    }

    pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
        const err = switch (lex_err) {
            error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
            error.StringLiteralTooLong => return .{
                .err = .string_literal_too_long,
                .code_page = self.current_code_page,
                .token = self.error_context_token.?,
                .extra = .{ .number = self.max_string_literal_codepoints },
            },
            error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
            error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
            error.IllegalByte => ErrorDetails.Error.illegal_byte,
            error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
            error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
            error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
            error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
            error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
            error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
            error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
            error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
            error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
            error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
            error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
            error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
        };
        return .{
            .err = err,
            .code_page = self.current_code_page,
            .token = self.error_context_token.?,
        };
    }
};

fn parseCodePageNum(str: []const u8) !u32 {
    var x: u32 = 0;
    for (str) |c| {
        const digit = try std.fmt.charToDigit(c, 10);
        if (x != 0) x = try std.math.mul(u32, x, 10);
        x = try std.math.add(u32, x, digit);
    }
    return x;
}

/// Returns `null` when the code_page is set to DEFAULT
pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
    var command = full_command;

    // Anything besides exactly this is ignored by the Windows RC implementation
    const expected_directive = "#pragma";
    if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
    command = command[expected_directive.len..];

    if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
    //       and it will error with 'Missing left parenthesis in code_page #pragma'
    const expected_extension = "code_page";
    if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
    command = command[expected_extension.len..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    if (command.len == 0 or command[0] != '(') {
        return error.CodePagePragmaMissingLeftParen;
    }
    command = command[1..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    var num_str: []u8 = command[0..0];
    while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
        command = command[1..];
        num_str.len += 1;
    }

    if (num_str.len == 0) {
        return error.CodePagePragmaNotInteger;
    }

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    if (command.len == 0 or command[0] != ')') {
        return error.CodePagePragmaMissingRightParen;
    }

    const code_page: ?SupportedCodePage = code_page: {
        if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
            break :code_page null;
        }

        // The Win32 compiler behaves fairly strangely around maxInt(u32):
        // - If the overflowed u32 wraps and becomes a known code page ID, then
        //   it will error/warn with "Codepage not valid:  ignored" (depending on /w)
        // - If the overflowed u32 wraps and does not become a known code page ID,
        //   then it will error with 'constant too big' and 'Codepage not integer'
        //
        // Instead of that, we just have a separate error specifically for overflow.
        const num = parseCodePageNum(num_str) catch |err| switch (err) {
            error.InvalidCharacter => return error.CodePagePragmaNotInteger,
            error.Overflow => return error.CodePagePragmaOverflow,
        };

        // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
        if (num_str[0] == '0' and num != 0) {
            return error.CodePagePragmaInvalidCodePage;
        }
        // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
        else if (num == 0) {
            return error.CodePagePragmaNotInteger;
        }
        // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
        if (num > std.math.maxInt(u16)) {
            return error.CodePagePragmaInvalidCodePage;
        }

        break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) {
            error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage,
            error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage,
        };
    };

    return code_page;
}

fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void {
    var lexer = Lexer.init(source, .{});
    if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer});
    for (expected_tokens) |expected_token_id| {
        const token = try lexer.nextNormal();
        if (dumpTokensDuringTests) lexer.dump(&token);
        try std.testing.expectEqual(expected_token_id, token.id);
    }
    const last_token = try lexer.nextNormal();
    try std.testing.expectEqual(Token.Id.eof, last_token.id);
}

fn expectLexError(expected: LexError, actual: anytype) !void {
    try std.testing.expectError(expected, actual);
    if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual});
}

test "normal: numbers" {
    try testLexNormal("1", &.{.number});
    try testLexNormal("-1", &.{.number});
    try testLexNormal("- 1", &.{ .number, .number });
    try testLexNormal("-a", &.{.number});
}

test "normal: string literals" {
    try testLexNormal("\"\"", &.{.quoted_ascii_string});
    // "" is an escaped "
    try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string});
}

test "superscript chars and code pages" {
    const firstToken = struct {
        pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token {
            var lexer = Lexer.init(source, .{ .default_code_page = default_code_page });
            return lexer.next(lex_method);
        }
    }.firstToken;
    const utf8_source = "²";
    const windows1252_source = "\xB2";

    const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal);
    try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252);

    const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal);
    try std.testing.expectEqual(Token{
        .id = .literal,
        .start = 0,
        .end = 2,
        .line_number = 1,
    }, utf8_encoded_as_windows1252);

    const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal);
    try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8);

    const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal);
    try std.testing.expectEqual(Token{
        .id = .literal,
        .start = 0,
        .end = 1,
        .line_number = 1,
    }, windows1252_encoded_as_utf8);
}