Merge pull request #3648 from xackus/json-unescape

breaking: JSON unescape
2026-02-13 21:08:36 +00:00 · 2019-12-29 18:31:10 -05:00 · 2019-12-29 18:31:10 -05:00 · 54231e832b
commit 54231e832b
parent 6af39aa49a 6d3b95a708
2 changed files with 196 additions and 47 deletions
--- a/lib/std/json.zig
+++ b/lib/std/json.zig
@ -10,18 +10,18 @@ const maxInt = std.math.maxInt;

 pub const WriteStream = @import("json/write_stream.zig").WriteStream;

-// A single token slice into the parent string.
-//
-// Use `token.slice()` on the input at the current position to get the current slice.
+/// A single token slice into the parent string.
+///
+/// Use `token.slice()` on the input at the current position to get the current slice.
 pub const Token = struct {
    id: Id,
-    // How many bytes do we skip before counting
+    /// How many bytes do we skip before counting
    offset: u1,
-    // Whether string contains a \uXXXX sequence and cannot be zero-copied
+    /// Whether string contains an escape sequence and cannot be zero-copied
    string_has_escape: bool,
-    // Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
+    /// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
    number_is_integer: bool,
-    // How many bytes from the current position behind the start of this token is.
+    /// How many bytes behind the current position the start of this token is.
    count: usize,

    pub const Id = enum {
@ -66,7 +66,7 @@ pub const Token = struct {
        };
    }

-    // A marker token is a zero-length
+    /// A marker token is a zero-length token.
    pub fn initMarker(id: Id) Token {
        return Token{
            .id = id,
@ -77,19 +77,19 @@ pub const Token = struct {
        };
    }

-    // Slice into the underlying input string.
+    /// Slice into the underlying input string.
    pub fn slice(self: Token, input: []const u8, i: usize) []const u8 {
        return input[i + self.offset - self.count .. i + self.offset];
    }
 };

-// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
-// they are encountered. No copies or allocations are performed during parsing and the entire
-// parsing state requires ~40-50 bytes of stack space.
-//
-// Conforms strictly to RFC8529.
-//
-// For a non-byte based wrapper, consider using TokenStream instead.
+/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
+/// they are encountered. No copies or allocations are performed during parsing and the entire
+/// parsing state requires ~40-50 bytes of stack space.
+///
+/// Conforms strictly to RFC8259.
+///
+/// For a non-byte based wrapper, consider using TokenStream instead.
 pub const StreamingParser = struct {
    // Current state
    state: State,
@ -205,10 +205,10 @@ pub const StreamingParser = struct {
        InvalidControlCharacter,
    };

-    // Give another byte to the parser and obtain any new tokens. This may (rarely) return two
-    // tokens. token2 is always null if token1 is null.
-    //
-    // There is currently no error recovery on a bad stream.
+    /// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
+    /// tokens. token2 is always null if token1 is null.
+    ///
+    /// There is currently no error recovery on a bad stream.
    pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
        token1.* = null;
        token2.* = null;
@ -866,7 +866,7 @@ pub const StreamingParser = struct {
    }
 };

-// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
+/// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
 pub const TokenStream = struct {
    i: usize,
    slice: []const u8,
@ -905,7 +905,13 @@ pub const TokenStream = struct {
            }
        }

-        if (self.parser.complete) {
+        // Without this a bare number fails, because the streaming parser doesn't know it ended
+        try self.parser.feed(' ', &t1, &t2);
+        self.i += 1;
+
+        if (t1) |token| {
+            return token;
+        } else if (self.parser.complete) {
            return null;
        } else {
            return error.UnexpectedEndOfJson;
@ -971,8 +977,8 @@ test "json.token" {
    testing.expect((try p.next()) == null);
 }

-// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
-// be able to decode the string even if this returns true.
+/// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
+/// be able to decode the string even if this returns true.
 pub fn validate(s: []const u8) bool {
    var p = StreamingParser.init();

@ -1009,6 +1015,8 @@ pub const ValueTree = struct {
 pub const ObjectMap = StringHashMap(Value);
 pub const Array = ArrayList(Value);

+/// Represents a JSON value
+/// Currently only supports numbers that fit into i64 or f64.
 pub const Value = union(enum) {
    Null,
    Bool: bool,
@ -1055,7 +1063,7 @@ pub const Value = union(enum) {
    }
 };

-// A non-stream JSON parser which constructs a tree of Value's.
+/// A non-stream JSON parser which constructs a tree of `Value`s.
 pub const Parser = struct {
    allocator: *Allocator,
    state: State,
@ -1124,7 +1132,10 @@ pub const Parser = struct {
                    p.state = State.ObjectValue;
                },
                else => {
-                    unreachable;
+                    // The streaming parser would return an error eventually.
+                    // To prevent invalid state we return an error now.
+                    // TODO make the streaming parser return an error as soon as it encounters an invalid object key
+                    return error.InvalidLiteral;
                },
            },
            State.ObjectValue => {
@ -1266,7 +1277,7 @@ pub const Parser = struct {
        // TODO: We don't strictly have to copy values which do not contain any escape
        // characters if flagged with the option.
        const slice = token.slice(input, i);
-        return Value{ .String = try mem.dupe(allocator, u8, slice) };
+        return Value{ .String = try unescapeStringAlloc(allocator, slice) };
    }

    fn parseNumber(p: *Parser, token: Token, input: []const u8, i: usize) !Value {
@ -1277,6 +1288,77 @@ pub const Parser = struct {
    }
 };

+// Unescape a JSON string
+// Only to be used on strings already validated by the parser
+// (note the unreachable statements and lack of bounds checking)
+// Optimized for arena allocators, uses Allocator.shrink
+//
+// Idea: count how many bytes we will need to allocate in the streaming parser and store it
+// in the token to avoid allocating too much memory or iterating through the string again
+// Downside: need to find how many bytes a unicode escape sequence will produce twice
+fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 {
+    const output = try alloc.alloc(u8, input.len);
+    errdefer alloc.free(output);
+    
+    var inIndex: usize = 0;
+    var outIndex: usize = 0;
+
+    while(inIndex < input.len) {
+        if(input[inIndex] != '\\'){
+            // not an escape sequence
+            output[outIndex] = input[inIndex];
+            inIndex += 1;
+            outIndex += 1;
+        } else if(input[inIndex + 1] != 'u'){
+            // a simple escape sequence
+            output[outIndex] = @as(u8,
+                switch(input[inIndex + 1]){
+                    '\\' => '\\',
+                    '/' => '/',
+                    'n' => '\n',
+                    'r' => '\r',
+                    't' => '\t',
+                    'f' => 12,
+                    'b' => 8,
+                    '"' => '"',
+                    else => unreachable
+                }
+            );
+            inIndex += 2;
+            outIndex += 1;
+        } else {
+            // a unicode escape sequence
+            const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex+2 .. inIndex+6], 16) catch unreachable;
+
+            // guess optimistically that it's not a surrogate pair
+            if(std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| {
+                outIndex += byteCount;
+                inIndex += 6;
+            } else |err| {
+                // it might be a surrogate pair
+                if(err != error.Utf8CannotEncodeSurrogateHalf) {
+                    return error.InvalidUnicodeHexSymbol;
+                }
+                // check if a second code unit is present
+                if(inIndex + 7 >= input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u'){
+                    return error.InvalidUnicodeHexSymbol;
+                }
+                
+                const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex+8 .. inIndex+12], 16) catch unreachable;
+                
+                if(std.unicode.utf16leToUtf8(output[outIndex..], [2]u16{ firstCodeUnit, secondCodeUnit })) |byteCount| {
+                    outIndex += byteCount;
+                    inIndex += 12;
+                } else |_| {
+                    return error.InvalidUnicodeHexSymbol;
+                }
+            }
+        }
+    }
+
+    return alloc.shrink(output, outIndex);
+}
+
 test "json.parser.dynamic" {
    var p = Parser.init(debug.global_allocator, false);
    defer p.deinit();
@ -1399,3 +1481,36 @@ test "integer after float has proper type" {
    );
    std.testing.expect(json.Object.getValue("ints").?.Array.at(0) == .Integer);
 }
+
+test "escaped characters" {
+    const input =
+        \\{
+        \\  "backslash": "\\",
+        \\  "forwardslash": "\/",
+        \\  "newline": "\n",
+        \\  "carriagereturn": "\r",
+        \\  "tab": "\t",
+        \\  "formfeed": "\f",
+        \\  "backspace": "\b",
+        \\  "doublequote": "\"",
+        \\  "unicode": "\u0105",
+        \\  "surrogatepair": "\ud83d\ude02"
+        \\}
+    ;
+
+    var p = Parser.init(debug.global_allocator, false);
+    const tree = try p.parse(input);
+
+    const obj = tree.root.Object;
+
+    testing.expectEqualSlices(u8, obj.get("backslash").?.value.String, "\\");
+    testing.expectEqualSlices(u8, obj.get("forwardslash").?.value.String, "/");
+    testing.expectEqualSlices(u8, obj.get("newline").?.value.String, "\n");
+    testing.expectEqualSlices(u8, obj.get("carriagereturn").?.value.String, "\r");
+    testing.expectEqualSlices(u8, obj.get("tab").?.value.String, "\t");
+    testing.expectEqualSlices(u8, obj.get("formfeed").?.value.String, "\x0C");
+    testing.expectEqualSlices(u8, obj.get("backspace").?.value.String, "\x08");
+    testing.expectEqualSlices(u8, obj.get("doublequote").?.value.String, "\"");
+    testing.expectEqualSlices(u8, obj.get("unicode").?.value.String, "ą");
+    testing.expectEqualSlices(u8, obj.get("surrogatepair").?.value.String, "😂");
+}
--- a/lib/std/json/test.zig
+++ b/lib/std/json/test.zig
@ -7,14 +7,46 @@ const std = @import("../std.zig");

 fn ok(comptime s: []const u8) void {
    std.testing.expect(std.json.validate(s));
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    _ = p.parse(s) catch unreachable;
 }

 fn err(comptime s: []const u8) void {
    std.testing.expect(!std.json.validate(s));
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    if(p.parse(s)) |_| {
+        unreachable;
+    } else |_| {}
 }

 fn any(comptime s: []const u8) void {
-    std.testing.expect(true);
+    _ = std.json.validate(s);
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+    
+    _ = p.parse(s) catch {};
+}
+
+fn anyStreamingErrNonStreaming(comptime s: []const u8) void {
+    _ = std.json.validate(s);
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    if(p.parse(s)) |_| {
+        unreachable;
+    } else |_| {}
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -611,9 +643,9 @@ test "n_array_colon_instead_of_comma" {
 }

 test "n_array_comma_after_close" {
-    //err(
-    //    \\[""],
-    //);
+    err(
+        \\[""],
+    );
 }

 test "n_array_comma_and_number" {
@ -641,9 +673,9 @@ test "n_array_extra_close" {
 }

 test "n_array_extra_comma" {
-    //err(
-    //    \\["",]
-    //);
+    err(
+        \\["",]
+    );
 }

 test "n_array_incomplete_invalid_value" {
@ -1708,9 +1740,11 @@ test "i_number_double_huge_neg_exp" {
 }

 test "i_number_huge_exp" {
-    any(
-        \\[0.4e00669999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999969999999006]
-    );
+    return error.SkipZigTest;
+    // FIXME Integer overflow in parseFloat
+//     any(
+//         \\[0.4e00669999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999969999999006]
+//     );
 }

 test "i_number_neg_int_huge_exp" {
@ -1762,49 +1796,49 @@ test "i_number_very_big_negative_int" {
 }

 test "i_object_key_lone_2nd_surrogate" {
-    any(
+    anyStreamingErrNonStreaming(
        \\{"\uDFAA":0}
    );
 }

 test "i_string_1st_surrogate_but_2nd_missing" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uDADA"]
    );
 }

 test "i_string_1st_valid_surrogate_2nd_invalid" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uD888\u1234"]
    );
 }

 test "i_string_incomplete_surrogate_and_escape_valid" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uD800\n"]
    );
 }

 test "i_string_incomplete_surrogate_pair" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uDd1ea"]
    );
 }

 test "i_string_incomplete_surrogates_escape_valid" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uD800\uD800\n"]
    );
 }

 test "i_string_invalid_lonely_surrogate" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\ud800"]
    );
 }

 test "i_string_invalid_surrogate" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\ud800abc"]
    );
 }
@ -1816,7 +1850,7 @@ test "i_string_invalid_utf-8" {
 }

 test "i_string_inverted_surrogates_U+1D11E" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uDd1e\uD834"]
    );
 }
@ -1828,7 +1862,7 @@ test "i_string_iso_latin_1" {
 }

 test "i_string_lone_second_surrogate" {
-    any(
+    anyStreamingErrNonStreaming(
        \\["\uDFAA"]
    );
 }