diff --git a/lib/std/json/scanner.zig b/lib/std/json/scanner.zig index 54b661113b..85a058af38 100644 --- a/lib/std/json/scanner.zig +++ b/lib/std/json/scanner.zig @@ -897,7 +897,7 @@ pub const Scanner = struct { }, .number_post_dot => { if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false); - switch (try self.expectByte()) { + switch (self.input[self.cursor]) { '0'...'9' => { self.cursor += 1; self.state = .number_frac; @@ -1032,7 +1032,8 @@ pub const Scanner = struct { return error.BufferUnderrun; }, .string_backslash => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { '"', '\\', '/' => { // Since these characters now represent themselves literally, // we can simply begin the next plaintext slice here. @@ -1080,7 +1081,8 @@ pub const Scanner = struct { } }, .string_backslash_u => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.utf16_code_units[0] = @as(u16, c - '0') << 12; @@ -1098,7 +1100,8 @@ pub const Scanner = struct { continue :state_loop; }, .string_backslash_u_1 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.utf16_code_units[0] |= @as(u16, c - '0') << 8; @@ -1116,7 +1119,8 @@ pub const Scanner = struct { continue :state_loop; }, .string_backslash_u_2 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.utf16_code_units[0] |= @as(u16, c - '0') << 4; @@ -1134,7 +1138,8 @@ pub const Scanner = struct { continue :state_loop; }, .string_backslash_u_3 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); 
+ const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.utf16_code_units[0] |= c - '0'; @@ -1160,7 +1165,8 @@ pub const Scanner = struct { } }, .string_surrogate_half => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { '\\' => { self.cursor += 1; self.state = .string_surrogate_half_backslash; @@ -1170,7 +1176,8 @@ pub const Scanner = struct { } }, .string_surrogate_half_backslash => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 'u' => { self.cursor += 1; self.state = .string_surrogate_half_backslash_u; @@ -1180,7 +1187,8 @@ pub const Scanner = struct { } }, .string_surrogate_half_backslash_u => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 'D', 'd' => { self.cursor += 1; self.utf16_code_units[1] = 0xD << 12; @@ -1191,7 +1199,8 @@ pub const Scanner = struct { } }, .string_surrogate_half_backslash_u_1 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { 'C'...'F' => { self.cursor += 1; @@ -1209,7 +1218,8 @@ pub const Scanner = struct { } }, .string_surrogate_half_backslash_u_2 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.cursor += 1; @@ -1233,7 +1243,8 @@ pub const Scanner = struct { } }, .string_surrogate_half_backslash_u_3 => { - const c = try self.expectByte(); + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + const c = self.input[self.cursor]; switch (c) { '0'...'9' => { self.utf16_code_units[1] |= c - '0'; @@ -1254,7 +1265,8 @@ pub const Scanner = struct { }, .string_utf8_last_byte => { - 
switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x80...0xBF => { self.cursor += 1; self.state = .string; @@ -1264,7 +1276,8 @@ pub const Scanner = struct { } }, .string_utf8_second_to_last_byte => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x80...0xBF => { self.cursor += 1; self.state = .string_utf8_last_byte; @@ -1274,7 +1287,8 @@ pub const Scanner = struct { } }, .string_utf8_second_to_last_byte_guard_against_overlong => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0xA0...0xBF => { self.cursor += 1; self.state = .string_utf8_last_byte; @@ -1284,7 +1298,8 @@ pub const Scanner = struct { } }, .string_utf8_second_to_last_byte_guard_against_surrogate_half => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x80...0x9F => { self.cursor += 1; self.state = .string_utf8_last_byte; @@ -1294,7 +1309,8 @@ pub const Scanner = struct { } }, .string_utf8_third_to_last_byte => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x80...0xBF => { self.cursor += 1; self.state = .string_utf8_second_to_last_byte; @@ -1304,7 +1320,8 @@ pub const Scanner = struct { } }, .string_utf8_third_to_last_byte_guard_against_overlong => { - switch (try self.expectByte()) { + if (self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x90...0xBF => { self.cursor += 1; self.state = .string_utf8_second_to_last_byte; @@ -1314,7 +1331,8 @@ pub const Scanner = struct { } }, .string_utf8_third_to_last_byte_guard_against_too_large => { - switch (try self.expectByte()) { + if 
(self.cursor >= self.input.len) return self.endOfBufferInString(); + switch (self.input[self.cursor]) { 0x80...0x8F => { self.cursor += 1; self.state = .string_utf8_second_to_last_byte; @@ -1666,6 +1684,17 @@ pub const Scanner = struct { self.value_start = self.cursor; return slice; } + fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 { + // Return "" when the slice would end at or before value_start, e.g. when the escape sequence started before the current input buffer. + // (The comparison adds the offset to value_start instead of subtracting it from cursor to avoid unsigned underflow; + // it's just making sure the slice on the next line isn't UB.) + if (self.cursor <= self.value_start + trailing_negative_offset) return ""; + const slice = self.input[self.value_start .. self.cursor - trailing_negative_offset]; + // When trailing_negative_offset is non-zero, setting self.value_start doesn't matter, + // because we always set it again while emitting the .partial_string_escaped_*. + self.value_start = self.cursor; + return slice; + } fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token { const slice = self.takeValueSlice(); @@ -1678,6 +1707,39 @@ pub const Scanner = struct { return Token{ .partial_number = slice }; } + fn endOfBufferInString(self: *@This()) !Token { + if (self.is_end_of_input) return error.UnexpectedEndOfInput; + const slice = self.takeValueSliceMinusTrailingOffset(switch (self.state) { + // Don't include the in-progress escape sequence in the partial string: each successive escape state withholds one more trailing byte (1 through 11). + .string_backslash => 1, + .string_backslash_u => 2, + .string_backslash_u_1 => 3, + .string_backslash_u_2 => 4, + .string_backslash_u_3 => 5, + .string_surrogate_half => 6, + .string_surrogate_half_backslash => 7, + .string_surrogate_half_backslash_u => 8, + .string_surrogate_half_backslash_u_1 => 9, + .string_surrogate_half_backslash_u_2 => 10, + .string_surrogate_half_backslash_u_3 => 11, + + // Every other string state withholds nothing: include everything up to the cursor.
+ .string, + .string_utf8_last_byte, + .string_utf8_second_to_last_byte, + .string_utf8_second_to_last_byte_guard_against_overlong, + .string_utf8_second_to_last_byte_guard_against_surrogate_half, + .string_utf8_third_to_last_byte, + .string_utf8_third_to_last_byte_guard_against_overlong, + .string_utf8_third_to_last_byte_guard_against_too_large, + => 0, + + else => unreachable, + }); + if (slice.len == 0) return error.BufferUnderrun; + return Token{ .partial_string = slice }; + } + fn partialStringCodepoint(code_point: u21) Token { var buf: [4]u8 = undefined; switch (std.unicode.utf8Encode(code_point, &buf) catch unreachable) {