From 2933a8241a54af436f2df5eac73aa2acf5eabd40 Mon Sep 17 00:00:00 2001
From: hryx <codroid@gmail.com>
Date: Sun, 5 Jan 2020 23:16:38 -0800
Subject: [PATCH] json: disallow overlong and out-of-range UTF-8

Fixes #2379

= Overlong (non-shortest) sequences

UTF-8's unique encoding scheme allows for some Unicode codepoints
to be represented in multiple ways. For any of these characters,
the spec forbids all but the shortest form. These disallowed longer
sequences are called "overlong". As an interesting side effect of
this rule, the bytes C0 and C1 never appear in valid UTF-8.

= Codepoint range

UTF-8 disallows representation of codepoints beyond U+10FFFF,
which is the highest codepoint that can be encoded in UTF-16.
Because a 4-byte sequence is capable of encoding such codepoints,
they must be explicitly rejected. This rule also has an interesting
side effect, which is that bytes F5 to FF never appear in valid UTF-8.

= References

Detecting an overlong version of a codepoint could get gnarly, but
luckily The Unicode Consortium did the hard work by creating this
handy table of valid byte sequences:

https://unicode.org/versions/corrigendum1.html

I thought this mapped nicely to the parser's state machine, so I
rearranged the relevant states to make use of it.
---
 lib/std/json.zig      | 76 ++++++++++++++++++++++++++++++++-----------
 lib/std/json/test.zig | 66 +++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 19 deletions(-)

diff --git a/lib/std/json.zig b/lib/std/json.zig
index a7e98ad1a5..481ca7dd95 100644
--- a/lib/std/json.zig
+++ b/lib/std/json.zig
@@ -87,6 +87,8 @@ pub const StreamingParser = struct {
     string_last_was_high_surrogate: bool,
     // Used inside of StringEscapeHexUnicode* states
     string_unicode_codepoint: u21,
+    // The first byte needs to be stored to validate 3- and 4-byte sequences.
+    sequence_first_byte: u8 = undefined,
     // When in .Number states, is the number a (still) valid integer?
     number_is_integer: bool,
 
@@ -132,9 +134,12 @@ pub const StreamingParser = struct {
         ValueBeginNoClosing,
 
         String,
-        StringUtf8Byte3,
-        StringUtf8Byte2,
-        StringUtf8Byte1,
+        StringUtf8Byte2Of2,
+        StringUtf8Byte2Of3,
+        StringUtf8Byte3Of3,
+        StringUtf8Byte2Of4,
+        StringUtf8Byte3Of4,
+        StringUtf8Byte4Of4,
         StringEscapeCharacter,
         StringEscapeHexUnicode4,
         StringEscapeHexUnicode3,
@@ -581,35 +586,68 @@ pub const StreamingParser = struct {
                     // non-control ascii
                     p.string_last_was_high_surrogate = false;
                 },
-                0xC0...0xDF => {
-                    p.state = .StringUtf8Byte1;
+                0xC2...0xDF => {
+                    p.state = .StringUtf8Byte2Of2;
                 },
                 0xE0...0xEF => {
-                    p.state = .StringUtf8Byte2;
+                    p.state = .StringUtf8Byte2Of3;
+                    p.sequence_first_byte = c;
                 },
-                0xF0...0xFF => {
-                    p.state = .StringUtf8Byte3;
+                0xF0...0xF4 => {
+                    p.state = .StringUtf8Byte2Of4;
+                    p.sequence_first_byte = c;
                 },
                 else => {
                     return error.InvalidUtf8Byte;
                 },
             },
 
-            .StringUtf8Byte3 => switch (c >> 6) {
-                0b10 => p.state = .StringUtf8Byte2,
+            .StringUtf8Byte2Of2 => switch (c >> 6) {
+                0b10 => p.state = .String,
                 else => return error.InvalidUtf8Byte,
             },
-
-            .StringUtf8Byte2 => switch (c >> 6) {
-                0b10 => p.state = .StringUtf8Byte1,
+            .StringUtf8Byte2Of3 => {
+                switch (p.sequence_first_byte) {
+                    0xE0 => switch (c) {
+                        0xA0...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xE1...0xEF => switch (c) {
+                        0x80...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    else => return error.InvalidUtf8Byte,
+                }
+                p.state = .StringUtf8Byte3Of3;
+            },
+            .StringUtf8Byte3Of3 => switch (c) {
+                0x80...0xBF => p.state = .String,
                 else => return error.InvalidUtf8Byte,
             },
-
-            .StringUtf8Byte1 => switch (c >> 6) {
-                0b10 => {
-                    p.state = .String;
-                    p.string_last_was_high_surrogate = false;
-                },
+            .StringUtf8Byte2Of4 => {
+                switch (p.sequence_first_byte) {
+                    0xF0 => switch (c) {
+                        0x90...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xF1...0xF3 => switch (c) {
+                        0x80...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xF4 => switch (c) {
+                        0x80...0x8F => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    else => return error.InvalidUtf8Byte,
+                }
+                p.state = .StringUtf8Byte3Of4;
+            },
+            .StringUtf8Byte3Of4 => switch (c) {
+                0x80...0xBF => p.state = .StringUtf8Byte4Of4,
+                else => return error.InvalidUtf8Byte,
+            },
+            .StringUtf8Byte4Of4 => switch (c) {
+                0x80...0xBF => p.state = .String,
                 else => return error.InvalidUtf8Byte,
             },
 
diff --git a/lib/std/json/test.zig b/lib/std/json/test.zig
index 5cc069bda3..28fdca1b0f 100644
--- a/lib/std/json/test.zig
+++ b/lib/std/json/test.zig
@@ -27,6 +27,20 @@ fn err(comptime s: []const u8) void {
     } else |_| {}
 }
 
+fn utf8Error(comptime s: []const u8) void { // expects `s` to be rejected specifically as invalid UTF-8
+    std.testing.expect(!std.json.validate(s)); // std.json.validate(s) is expected to be false
+    // Then run std.json.Parser over the same input, allocating from a fixed 20 KiB stack buffer.
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    if (p.parse(s)) |_| {
+        unreachable; // a successful parse means the invalid input was accepted
+    } else |e| {
+        std.testing.expect(e == error.InvalidUtf8Byte); // the error must be exactly InvalidUtf8Byte
+    }
+}
+
 fn any(comptime s: []const u8) void {
     _ = std.json.validate(s);
 
@@ -1936,3 +1950,55 @@ test "i_structure_UTF-8_BOM_empty_object" {
         \\ï»¿{}
     );
 }
+
+test "truncated UTF-8 sequence" { // the closing quote arrives where a continuation byte is required
+    utf8Error("\"\xc2\""); // 2-byte lead (C2..DF) with no 2nd byte
+    utf8Error("\"\xdf\""); // 2-byte lead (C2..DF) with no 2nd byte
+    utf8Error("\"\xed\xa0\""); // 3-byte sequence cut after an accepted 2nd byte
+    utf8Error("\"\xf0\x90\""); // 4-byte sequence cut after the 2nd byte; F0 needs 90..BF to get that far
+    utf8Error("\"\xf0\x90\x80\""); // 4-byte sequence cut after the 3rd byte
+}
+
+test "invalid continuation byte" { // a byte outside 80..BF sits where a continuation byte is required
+    utf8Error("\"\xc2\x00\""); // byte 2 of 2: below 80
+    utf8Error("\"\xc2\x7f\""); // byte 2 of 2: below 80
+    utf8Error("\"\xc2\xc0\""); // byte 2 of 2: above BF
+    utf8Error("\"\xc3\xc1\""); // byte 2 of 2: above BF
+    utf8Error("\"\xc4\xf5\""); // byte 2 of 2: above BF
+    utf8Error("\"\xc5\xff\""); // byte 2 of 2: above BF
+    utf8Error("\"\xe4\x80\x00\""); // byte 3 of 3: below 80
+    utf8Error("\"\xe5\x80\x10\""); // byte 3 of 3: below 80
+    utf8Error("\"\xe6\x80\xc0\""); // byte 3 of 3: above BF
+    utf8Error("\"\xe7\x80\xf5\""); // byte 3 of 3: above BF
+    utf8Error("\"\xe8\x00\x80\""); // byte 2 of 3: below 80
+    utf8Error("\"\xf2\x00\x80\x80\""); // byte 2 of 4: below 80
+    utf8Error("\"\xf0\x90\x00\x80\""); // byte 3 of 4: below 80; F0 needs a 2nd byte in 90..BF to get that far
+    utf8Error("\"\xf1\x80\xc0\x80\""); // byte 3 of 4: above BF
+    utf8Error("\"\xf2\x80\x80\x00\""); // byte 4 of 4: below 80
+    utf8Error("\"\xf3\x80\x80\xc0\""); // byte 4 of 4: above BF
+    utf8Error("\"\xf4\x80\x80\xf5\""); // byte 4 of 4: above BF
+}
+
+test "disallowed overlong form" { // non-shortest encodings must be rejected
+    utf8Error("\"\xc0\x80\""); // C0 is not accepted as a lead byte (2-byte leads are C2..DF)
+    utf8Error("\"\xc0\x90\""); // C0 is not accepted as a lead byte
+    utf8Error("\"\xc1\x80\""); // C1 is not accepted as a lead byte
+    utf8Error("\"\xc1\x90\""); // C1 is not accepted as a lead byte
+    utf8Error("\"\xe0\x80\x80\""); // after E0 the 2nd byte must be A0..BF
+    utf8Error("\"\xf0\x80\x80\x80\""); // after F0 the 2nd byte must be 90..BF
+}
+
+test "out of UTF-16 range" { // byte sequences that would encode codepoints above U+10FFFF
+    utf8Error("\"\xf4\x90\x80\x80\""); // after F4 the 2nd byte must be 80..8F
+    utf8Error("\"\xf5\x80\x80\x80\""); // F5..FF are not accepted as lead bytes (4-byte leads are F0..F4)
+    utf8Error("\"\xf6\x80\x80\x80\"");
+    utf8Error("\"\xf7\x80\x80\x80\"");
+    utf8Error("\"\xf8\x80\x80\x80\"");
+    utf8Error("\"\xf9\x80\x80\x80\"");
+    utf8Error("\"\xfa\x80\x80\x80\"");
+    utf8Error("\"\xfb\x80\x80\x80\"");
+    utf8Error("\"\xfc\x80\x80\x80\"");
+    utf8Error("\"\xfd\x80\x80\x80\"");
+    utf8Error("\"\xfe\x80\x80\x80\"");
+    utf8Error("\"\xff\x80\x80\x80\""); // last possible byte value; still rejected as a lead byte
+}