mirror of
https://github.com/ziglang/zig.git
synced 2026-02-13 21:08:36 +00:00
Merge pull request #3648 from xackus/json-unescape
breaking: JSON unescape
This commit is contained in:
commit
54231e832b
169
lib/std/json.zig
169
lib/std/json.zig
@ -10,18 +10,18 @@ const maxInt = std.math.maxInt;
|
||||
|
||||
pub const WriteStream = @import("json/write_stream.zig").WriteStream;
|
||||
|
||||
// A single token slice into the parent string.
|
||||
//
|
||||
// Use `token.slice()` on the input at the current position to get the current slice.
|
||||
/// A single token slice into the parent string.
|
||||
///
|
||||
/// Use `token.slice()` on the input at the current position to get the current slice.
|
||||
pub const Token = struct {
|
||||
id: Id,
|
||||
// How many bytes do we skip before counting
|
||||
/// How many bytes do we skip before counting
|
||||
offset: u1,
|
||||
// Whether string contains a \uXXXX sequence and cannot be zero-copied
|
||||
/// Whether string contains an escape sequence and cannot be zero-copied
|
||||
string_has_escape: bool,
|
||||
// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
|
||||
/// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
|
||||
number_is_integer: bool,
|
||||
// How many bytes from the current position behind the start of this token is.
|
||||
/// How many bytes from the current position behind the start of this token is.
|
||||
count: usize,
|
||||
|
||||
pub const Id = enum {
|
||||
@ -66,7 +66,7 @@ pub const Token = struct {
|
||||
};
|
||||
}
|
||||
|
||||
// A marker token is a zero-length
|
||||
/// A marker token is a zero-length
|
||||
pub fn initMarker(id: Id) Token {
|
||||
return Token{
|
||||
.id = id,
|
||||
@ -77,19 +77,19 @@ pub const Token = struct {
|
||||
};
|
||||
}
|
||||
|
||||
// Slice into the underlying input string.
|
||||
/// Slice into the underlying input string.
|
||||
pub fn slice(self: Token, input: []const u8, i: usize) []const u8 {
|
||||
return input[i + self.offset - self.count .. i + self.offset];
|
||||
}
|
||||
};
|
||||
|
||||
// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
|
||||
// they are encountered. No copies or allocations are performed during parsing and the entire
|
||||
// parsing state requires ~40-50 bytes of stack space.
|
||||
//
|
||||
// Conforms strictly to RFC8529.
|
||||
//
|
||||
// For a non-byte based wrapper, consider using TokenStream instead.
|
||||
/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
|
||||
/// they are encountered. No copies or allocations are performed during parsing and the entire
|
||||
/// parsing state requires ~40-50 bytes of stack space.
|
||||
///
|
||||
/// Conforms strictly to RFC8529.
|
||||
///
|
||||
/// For a non-byte based wrapper, consider using TokenStream instead.
|
||||
pub const StreamingParser = struct {
|
||||
// Current state
|
||||
state: State,
|
||||
@ -205,10 +205,10 @@ pub const StreamingParser = struct {
|
||||
InvalidControlCharacter,
|
||||
};
|
||||
|
||||
// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
|
||||
// tokens. token2 is always null if token1 is null.
|
||||
//
|
||||
// There is currently no error recovery on a bad stream.
|
||||
/// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
|
||||
/// tokens. token2 is always null if token1 is null.
|
||||
///
|
||||
/// There is currently no error recovery on a bad stream.
|
||||
pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
|
||||
token1.* = null;
|
||||
token2.* = null;
|
||||
@ -866,7 +866,7 @@ pub const StreamingParser = struct {
|
||||
}
|
||||
};
|
||||
|
||||
// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
|
||||
/// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
|
||||
pub const TokenStream = struct {
|
||||
i: usize,
|
||||
slice: []const u8,
|
||||
@ -905,7 +905,13 @@ pub const TokenStream = struct {
|
||||
}
|
||||
}
|
||||
|
||||
if (self.parser.complete) {
|
||||
// Without this a bare number fails, becasue the streaming parser doesn't know it ended
|
||||
try self.parser.feed(' ', &t1, &t2);
|
||||
self.i += 1;
|
||||
|
||||
if (t1) |token| {
|
||||
return token;
|
||||
} else if (self.parser.complete) {
|
||||
return null;
|
||||
} else {
|
||||
return error.UnexpectedEndOfJson;
|
||||
@ -971,8 +977,8 @@ test "json.token" {
|
||||
testing.expect((try p.next()) == null);
|
||||
}
|
||||
|
||||
// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
|
||||
// be able to decode the string even if this returns true.
|
||||
/// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
|
||||
/// be able to decode the string even if this returns true.
|
||||
pub fn validate(s: []const u8) bool {
|
||||
var p = StreamingParser.init();
|
||||
|
||||
@ -1009,6 +1015,8 @@ pub const ValueTree = struct {
|
||||
pub const ObjectMap = StringHashMap(Value);
|
||||
pub const Array = ArrayList(Value);
|
||||
|
||||
/// Represents a JSON value
|
||||
/// Currently only supports numbers that fit into i64 or f64.
|
||||
pub const Value = union(enum) {
|
||||
Null,
|
||||
Bool: bool,
|
||||
@ -1055,7 +1063,7 @@ pub const Value = union(enum) {
|
||||
}
|
||||
};
|
||||
|
||||
// A non-stream JSON parser which constructs a tree of Value's.
|
||||
/// A non-stream JSON parser which constructs a tree of Value's.
|
||||
pub const Parser = struct {
|
||||
allocator: *Allocator,
|
||||
state: State,
|
||||
@ -1124,7 +1132,10 @@ pub const Parser = struct {
|
||||
p.state = State.ObjectValue;
|
||||
},
|
||||
else => {
|
||||
unreachable;
|
||||
// The streaming parser would return an error eventually.
|
||||
// To prevent invalid state we return an error now.
|
||||
// TODO make the streaming parser return an error as soon as it encounters an invalid object key
|
||||
return error.InvalidLiteral;
|
||||
},
|
||||
},
|
||||
State.ObjectValue => {
|
||||
@ -1266,7 +1277,7 @@ pub const Parser = struct {
|
||||
// TODO: We don't strictly have to copy values which do not contain any escape
|
||||
// characters if flagged with the option.
|
||||
const slice = token.slice(input, i);
|
||||
return Value{ .String = try mem.dupe(allocator, u8, slice) };
|
||||
return Value{ .String = try unescapeStringAlloc(allocator, slice) };
|
||||
}
|
||||
|
||||
fn parseNumber(p: *Parser, token: Token, input: []const u8, i: usize) !Value {
|
||||
@ -1277,6 +1288,77 @@ pub const Parser = struct {
|
||||
}
|
||||
};
|
||||
|
||||
// Unescape a JSON string
|
||||
// Only to be used on strings already validated by the parser
|
||||
// (note the unreachable statements and lack of bounds checking)
|
||||
// Optimized for arena allocators, uses Allocator.shrink
|
||||
//
|
||||
// Idea: count how many bytes we will need to allocate in the streaming parser and store it
|
||||
// in the token to avoid allocating too much memory or iterating through the string again
|
||||
// Downside: need to find how many bytes a unicode escape sequence will produce twice
|
||||
fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 {
|
||||
const output = try alloc.alloc(u8, input.len);
|
||||
errdefer alloc.free(output);
|
||||
|
||||
var inIndex: usize = 0;
|
||||
var outIndex: usize = 0;
|
||||
|
||||
while(inIndex < input.len) {
|
||||
if(input[inIndex] != '\\'){
|
||||
// not an escape sequence
|
||||
output[outIndex] = input[inIndex];
|
||||
inIndex += 1;
|
||||
outIndex += 1;
|
||||
} else if(input[inIndex + 1] != 'u'){
|
||||
// a simple escape sequence
|
||||
output[outIndex] = @as(u8,
|
||||
switch(input[inIndex + 1]){
|
||||
'\\' => '\\',
|
||||
'/' => '/',
|
||||
'n' => '\n',
|
||||
'r' => '\r',
|
||||
't' => '\t',
|
||||
'f' => 12,
|
||||
'b' => 8,
|
||||
'"' => '"',
|
||||
else => unreachable
|
||||
}
|
||||
);
|
||||
inIndex += 2;
|
||||
outIndex += 1;
|
||||
} else {
|
||||
// a unicode escape sequence
|
||||
const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex+2 .. inIndex+6], 16) catch unreachable;
|
||||
|
||||
// guess optimistically that it's not a surrogate pair
|
||||
if(std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| {
|
||||
outIndex += byteCount;
|
||||
inIndex += 6;
|
||||
} else |err| {
|
||||
// it might be a surrogate pair
|
||||
if(err != error.Utf8CannotEncodeSurrogateHalf) {
|
||||
return error.InvalidUnicodeHexSymbol;
|
||||
}
|
||||
// check if a second code unit is present
|
||||
if(inIndex + 7 >= input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u'){
|
||||
return error.InvalidUnicodeHexSymbol;
|
||||
}
|
||||
|
||||
const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex+8 .. inIndex+12], 16) catch unreachable;
|
||||
|
||||
if(std.unicode.utf16leToUtf8(output[outIndex..], [2]u16{ firstCodeUnit, secondCodeUnit })) |byteCount| {
|
||||
outIndex += byteCount;
|
||||
inIndex += 12;
|
||||
} else |_| {
|
||||
return error.InvalidUnicodeHexSymbol;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return alloc.shrink(output, outIndex);
|
||||
}
|
||||
|
||||
test "json.parser.dynamic" {
|
||||
var p = Parser.init(debug.global_allocator, false);
|
||||
defer p.deinit();
|
||||
@ -1399,3 +1481,36 @@ test "integer after float has proper type" {
|
||||
);
|
||||
std.testing.expect(json.Object.getValue("ints").?.Array.at(0) == .Integer);
|
||||
}
|
||||
|
||||
test "escaped characters" {
|
||||
const input =
|
||||
\\{
|
||||
\\ "backslash": "\\",
|
||||
\\ "forwardslash": "\/",
|
||||
\\ "newline": "\n",
|
||||
\\ "carriagereturn": "\r",
|
||||
\\ "tab": "\t",
|
||||
\\ "formfeed": "\f",
|
||||
\\ "backspace": "\b",
|
||||
\\ "doublequote": "\"",
|
||||
\\ "unicode": "\u0105",
|
||||
\\ "surrogatepair": "\ud83d\ude02"
|
||||
\\}
|
||||
;
|
||||
|
||||
var p = Parser.init(debug.global_allocator, false);
|
||||
const tree = try p.parse(input);
|
||||
|
||||
const obj = tree.root.Object;
|
||||
|
||||
testing.expectEqualSlices(u8, obj.get("backslash").?.value.String, "\\");
|
||||
testing.expectEqualSlices(u8, obj.get("forwardslash").?.value.String, "/");
|
||||
testing.expectEqualSlices(u8, obj.get("newline").?.value.String, "\n");
|
||||
testing.expectEqualSlices(u8, obj.get("carriagereturn").?.value.String, "\r");
|
||||
testing.expectEqualSlices(u8, obj.get("tab").?.value.String, "\t");
|
||||
testing.expectEqualSlices(u8, obj.get("formfeed").?.value.String, "\x0C");
|
||||
testing.expectEqualSlices(u8, obj.get("backspace").?.value.String, "\x08");
|
||||
testing.expectEqualSlices(u8, obj.get("doublequote").?.value.String, "\"");
|
||||
testing.expectEqualSlices(u8, obj.get("unicode").?.value.String, "ą");
|
||||
testing.expectEqualSlices(u8, obj.get("surrogatepair").?.value.String, "😂");
|
||||
}
|
||||
|
||||
@ -7,14 +7,46 @@ const std = @import("../std.zig");
|
||||
|
||||
fn ok(comptime s: []const u8) void {
|
||||
std.testing.expect(std.json.validate(s));
|
||||
|
||||
var mem_buffer: [1024 * 20]u8 = undefined;
|
||||
const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
|
||||
var p = std.json.Parser.init(allocator, false);
|
||||
|
||||
_ = p.parse(s) catch unreachable;
|
||||
}
|
||||
|
||||
fn err(comptime s: []const u8) void {
|
||||
std.testing.expect(!std.json.validate(s));
|
||||
|
||||
var mem_buffer: [1024 * 20]u8 = undefined;
|
||||
const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
|
||||
var p = std.json.Parser.init(allocator, false);
|
||||
|
||||
if(p.parse(s)) |_| {
|
||||
unreachable;
|
||||
} else |_| {}
|
||||
}
|
||||
|
||||
fn any(comptime s: []const u8) void {
|
||||
std.testing.expect(true);
|
||||
_ = std.json.validate(s);
|
||||
|
||||
var mem_buffer: [1024 * 20]u8 = undefined;
|
||||
const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
|
||||
var p = std.json.Parser.init(allocator, false);
|
||||
|
||||
_ = p.parse(s) catch {};
|
||||
}
|
||||
|
||||
fn anyStreamingErrNonStreaming(comptime s: []const u8) void {
|
||||
_ = std.json.validate(s);
|
||||
|
||||
var mem_buffer: [1024 * 20]u8 = undefined;
|
||||
const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
|
||||
var p = std.json.Parser.init(allocator, false);
|
||||
|
||||
if(p.parse(s)) |_| {
|
||||
unreachable;
|
||||
} else |_| {}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -611,9 +643,9 @@ test "n_array_colon_instead_of_comma" {
|
||||
}
|
||||
|
||||
test "n_array_comma_after_close" {
|
||||
//err(
|
||||
// \\[""],
|
||||
//);
|
||||
err(
|
||||
\\[""],
|
||||
);
|
||||
}
|
||||
|
||||
test "n_array_comma_and_number" {
|
||||
@ -641,9 +673,9 @@ test "n_array_extra_close" {
|
||||
}
|
||||
|
||||
test "n_array_extra_comma" {
|
||||
//err(
|
||||
// \\["",]
|
||||
//);
|
||||
err(
|
||||
\\["",]
|
||||
);
|
||||
}
|
||||
|
||||
test "n_array_incomplete_invalid_value" {
|
||||
@ -1708,9 +1740,11 @@ test "i_number_double_huge_neg_exp" {
|
||||
}
|
||||
|
||||
test "i_number_huge_exp" {
|
||||
any(
|
||||
\\[0.4e00669999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999969999999006]
|
||||
);
|
||||
return error.SkipZigTest;
|
||||
// FIXME Integer overflow in parseFloat
|
||||
// any(
|
||||
// \\[0.4e00669999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999969999999006]
|
||||
// );
|
||||
}
|
||||
|
||||
test "i_number_neg_int_huge_exp" {
|
||||
@ -1762,49 +1796,49 @@ test "i_number_very_big_negative_int" {
|
||||
}
|
||||
|
||||
test "i_object_key_lone_2nd_surrogate" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\{"\uDFAA":0}
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_1st_surrogate_but_2nd_missing" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uDADA"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_1st_valid_surrogate_2nd_invalid" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uD888\u1234"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_incomplete_surrogate_and_escape_valid" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uD800\n"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_incomplete_surrogate_pair" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uDd1ea"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_incomplete_surrogates_escape_valid" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uD800\uD800\n"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_invalid_lonely_surrogate" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\ud800"]
|
||||
);
|
||||
}
|
||||
|
||||
test "i_string_invalid_surrogate" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\ud800abc"]
|
||||
);
|
||||
}
|
||||
@ -1816,7 +1850,7 @@ test "i_string_invalid_utf-8" {
|
||||
}
|
||||
|
||||
test "i_string_inverted_surrogates_U+1D11E" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uDd1e\uD834"]
|
||||
);
|
||||
}
|
||||
@ -1828,7 +1862,7 @@ test "i_string_iso_latin_1" {
|
||||
}
|
||||
|
||||
test "i_string_lone_second_surrogate" {
|
||||
any(
|
||||
anyStreamingErrNonStreaming(
|
||||
\\["\uDFAA"]
|
||||
);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user