From edf487b12645ced9e0745deb32b47c594f1f3072 Mon Sep 17 00:00:00 2001 From: daurnimator Date: Tue, 25 Feb 2020 20:30:57 +1100 Subject: [PATCH] std: add options to std.json.stringfy to control escaping --- lib/std/json.zig | 93 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/lib/std/json.zig b/lib/std/json.zig index 98e800edc6..6a82d4c9d6 100644 --- a/lib/std/json.zig +++ b/lib/std/json.zig @@ -2357,10 +2357,38 @@ pub const StringifyOptions = struct { /// Controls the whitespace emitted whitespace: ?Whitespace = null, - // TODO: make escaping '/' in strings optional? + /// Should '/' be escaped in strings? + escape_solidus: bool = false, + + /// Should unicode characters be escaped in strings? + escape_unicode: bool = false, + // TODO: allow picking if []u8 is string or array? }; +fn outputUnicodeEscape( + codepoint: u21, + out_stream: var, +) !void { + if (codepoint <= 0xFFFF) { + // If the character is in the Basic Multilingual Plane (U+0000 through U+FFFF), + // then it may be represented as a six-character sequence: a reverse solidus, followed + // by the lowercase letter u, followed by four hexadecimal digits that encode the character's code point. + try out_stream.writeAll("\\u"); + try std.fmt.formatIntValue(codepoint, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); + } else { + assert(codepoint <= 0x10FFFF); + // To escape an extended character that is not in the Basic Multilingual Plane, + // the character is represented as a 12-character sequence, encoding the UTF-16 surrogate pair. + const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; + const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; + try out_stream.writeAll("\\u"); + try std.fmt.formatIntValue(high, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); + try out_stream.writeAll("\\u"); + try std.fmt.formatIntValue(low, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); + } +} + pub fn stringify( value: var, options: StringifyOptions, @@ -2467,12 +2495,21 @@ pub fn stringify( var i: usize = 0; while (i < value.len) : (i += 1) { switch (value[i]) { - // normal ascii characters - 0x20...0x21, 0x23...0x2E, 0x30...0x5B, 0x5D...0x7F => try out_stream.writeAll(value[i .. i + 1]), - // control characters with short escapes + // normal ascii character + 0x20...0x21, 0x23...0x2E, 0x30...0x5B, 0x5D...0x7F => |c| try out_stream.writeByte(c), + // only 2 characters that *must* be escaped '\\' => try out_stream.writeAll("\\\\"), '\"' => try out_stream.writeAll("\\\""), - '/' => try out_stream.writeAll("\\/"), + // solidus is optional to escape + '/' => { + if (options.escape_solidus) { + try out_stream.writeAll("\\/"); + } else { + try out_stream.writeByte('\\'); + } + }, + // control characters with short escapes + // TODO: option to switch between unicode and 'short' forms? 0x8 => try out_stream.writeAll("\\b"), 0xC => try out_stream.writeAll("\\f"), '\n' => try out_stream.writeAll("\\n"), @@ -2480,22 +2517,12 @@ pub fn stringify( '\t' => try out_stream.writeAll("\\t"), else => { const ulen = std.unicode.utf8ByteSequenceLength(value[i]) catch unreachable; - const codepoint = std.unicode.utf8Decode(value[i .. i + ulen]) catch unreachable; - if (codepoint <= 0xFFFF) { - // If the character is in the Basic Multilingual Plane (U+0000 through U+FFFF), - // then it may be represented as a six-character sequence: a reverse solidus, followed - // by the lowercase letter u, followed by four hexadecimal digits that encode the character's code point. - try out_stream.writeAll("\\u"); - try std.fmt.formatIntValue(codepoint, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); + // control characters (only things left with 1 byte length) should always be printed as unicode escapes + if (ulen == 1 or options.escape_unicode) { + const codepoint = std.unicode.utf8Decode(value[i .. i + ulen]) catch unreachable; + try outputUnicodeEscape(codepoint, out_stream); } else { - // To escape an extended character that is not in the Basic Multilingual Plane, - // the character is represented as a 12-character sequence, encoding the UTF-16 surrogate pair. - const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; - const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; - try out_stream.writeAll("\\u"); - try std.fmt.formatIntValue(high, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); - try out_stream.writeAll("\\u"); - try std.fmt.formatIntValue(low, "x", std.fmt.FormatOptions{ .width = 4, .fill = '0' }, out_stream); + try out_stream.writeAll(value[i .. i + ulen]); } i += ulen - 1; }, @@ -2609,15 +2636,25 @@ test "stringify basic types" { test "stringify string" { try teststringify("\"hello\"", "hello", StringifyOptions{}); try teststringify("\"with\\nescapes\\r\"", "with\nescapes\r", StringifyOptions{}); + try teststringify("\"with\\nescapes\\r\"", "with\nescapes\r", StringifyOptions{ .escape_unicode = true }); try teststringify("\"with unicode\\u0001\"", "with unicode\u{1}", StringifyOptions{}); - try teststringify("\"with unicode\\u0080\"", "with unicode\u{80}", StringifyOptions{}); - try teststringify("\"with unicode\\u00ff\"", "with unicode\u{FF}", StringifyOptions{}); - try teststringify("\"with unicode\\u0100\"", "with unicode\u{100}", StringifyOptions{}); - try teststringify("\"with unicode\\u0800\"", "with unicode\u{800}", StringifyOptions{}); - try teststringify("\"with unicode\\u8000\"", "with unicode\u{8000}", StringifyOptions{}); - try teststringify("\"with unicode\\ud799\"", "with unicode\u{D799}", StringifyOptions{}); - try teststringify("\"with unicode\\ud800\\udc00\"", "with unicode\u{10000}", StringifyOptions{}); - try teststringify("\"with unicode\\udbff\\udfff\"", "with unicode\u{10FFFF}", StringifyOptions{}); + try teststringify("\"with unicode\\u0001\"", "with unicode\u{1}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{80}\"", "with unicode\u{80}", StringifyOptions{}); + try teststringify("\"with unicode\\u0080\"", "with unicode\u{80}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{FF}\"", "with unicode\u{FF}", StringifyOptions{}); + try teststringify("\"with unicode\\u00ff\"", "with unicode\u{FF}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{100}\"", "with unicode\u{100}", StringifyOptions{}); + try teststringify("\"with unicode\\u0100\"", "with unicode\u{100}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{800}\"", "with unicode\u{800}", StringifyOptions{}); + try teststringify("\"with unicode\\u0800\"", "with unicode\u{800}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{8000}\"", "with unicode\u{8000}", StringifyOptions{}); + try teststringify("\"with unicode\\u8000\"", "with unicode\u{8000}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{D799}\"", "with unicode\u{D799}", StringifyOptions{}); + try teststringify("\"with unicode\\ud799\"", "with unicode\u{D799}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{10000}\"", "with unicode\u{10000}", StringifyOptions{}); + try teststringify("\"with unicode\\ud800\\udc00\"", "with unicode\u{10000}", StringifyOptions{ .escape_unicode = true }); + try teststringify("\"with unicode\u{10FFFF}\"", "with unicode\u{10FFFF}", StringifyOptions{}); + try teststringify("\"with unicode\\udbff\\udfff\"", "with unicode\u{10FFFF}", StringifyOptions{ .escape_unicode = true }); } test "stringify tagged unions" {