zig/lib/std/zig/string_literal.zig

156 lines
6.0 KiB
Zig

const std = @import("../std.zig");
const assert = std.debug.assert;
/// Error set returned by `parseAlloc`.
pub const ParseError = error{
/// An allocation needed to hold the decoded bytes failed.
OutOfMemory,
/// `parseAppend` reported something other than `Result.success`; the
/// detailed reason is not carried by this error.
InvalidStringLiteral,
};
/// Outcome of `parseAppend`.
/// Every variant other than `success` carries the index at which the problem
/// was detected. As produced by `parseAppend` in this file, that index counts
/// from the first byte after the opening quote (it indexes `bytes[1..]`).
pub const Result = union(enum) {
/// The whole literal was decoded.
success,
/// Found an invalid character at this index
/// (e.g. a raw newline or an unrecognized escape character).
invalid_character: usize,
/// Expected hex digits at this index.
/// `parseAppend` reports the index of the `x` of the `\x` escape.
expected_hex_digits: usize,
/// Invalid hex digits at this index.
/// `parseAppend` reports the index of the first byte after the `x`.
invalid_hex_escape: usize,
/// Invalid unicode escape at this index.
/// `parseAppend` reports the index of the `{` of the `\u{...}` escape.
invalid_unicode_escape: usize,
/// The left brace at this index is missing a matching right brace.
missing_matching_rbrace: usize,
/// Expected unicode digits at this index.
/// `parseAppend` reports the index of the `u` of the `\u` escape.
expected_unicode_digits: usize,
};
/// Parses `bytes` as a Zig string literal and appends the decoded contents to `buf`.
/// Asserts `bytes` has '"' at beginning and end.
///
/// Returns `Result.success` when the whole literal was decoded. Otherwise returns
/// the variant describing the first problem found; the index it carries is
/// relative to `bytes[1..]`, i.e. the literal with its opening quote removed.
/// Bytes decoded before the problem was found stay appended to `buf`; `buf` is
/// only rolled back to its previous length when `error.OutOfMemory` is returned.
///
/// `\u{...}` escapes are appended as the UTF-8 encoding of the code point.
/// Code points that cannot be encoded (above 0x10ffff, or surrogate halves)
/// are reported as `invalid_unicode_escape`.
pub fn parseAppend(buf: *std.ArrayList(u8), bytes: []const u8) error{OutOfMemory}!Result {
    assert(bytes.len >= 2 and bytes[0] == '"' and bytes[bytes.len - 1] == '"');
    // Drop the opening quote; the closing quote stays and terminates the loop.
    const slice = bytes[1..];

    const prev_len = buf.items.len;
    // The decoded output is never longer than the literal's contents.
    try buf.ensureUnusedCapacity(slice.len - 1);
    errdefer buf.shrinkRetainingCapacity(prev_len);

    const State = enum {
        Start,
        Backslash,
    };

    var state = State.Start;
    var index: usize = 0;
    // Invariant: `index < slice.len`. The last byte of `slice` is '"', which either
    // ends the literal (Start) or is rejected (Backslash), and the multi-byte
    // escapes below only ever skip over bytes that are not '"'.
    while (true) : (index += 1) {
        const b = slice[index];

        switch (state) {
            State.Start => switch (b) {
                '\\' => state = State.Backslash,
                '\n' => {
                    return Result{ .invalid_character = index };
                },
                '"' => return Result.success,
                else => try buf.append(b),
            },
            State.Backslash => switch (b) {
                'n' => {
                    try buf.append('\n');
                    state = State.Start;
                },
                'r' => {
                    try buf.append('\r');
                    state = State.Start;
                },
                '\\' => {
                    try buf.append('\\');
                    state = State.Start;
                },
                't' => {
                    try buf.append('\t');
                    state = State.Start;
                },
                '\'' => {
                    try buf.append('\'');
                    state = State.Start;
                },
                '"' => {
                    // If this is the final byte, it is the quote the assert relied on.
                    // Consuming it as an escaped quote would leave the literal
                    // unterminated and make the next iteration read past the end.
                    if (index + 1 == slice.len) {
                        return Result{ .invalid_character = index };
                    }
                    try buf.append('"');
                    state = State.Start;
                },
                'x' => {
                    // TODO: add more/better/broader tests for this.
                    // Exactly two hex digits must follow the 'x'.
                    const index_continue = index + 3;
                    if (slice.len < index_continue) {
                        return Result{ .expected_hex_digits = index };
                    }
                    if (std.fmt.parseUnsigned(u8, slice[index + 1 .. index_continue], 16)) |byte| {
                        try buf.append(byte);
                        state = State.Start;
                        index = index_continue - 1; // loop-header increments again
                    } else |err| switch (err) {
                        error.Overflow => unreachable, // 2 digits base 16 fits in a u8.
                        error.InvalidCharacter => {
                            return Result{ .invalid_hex_escape = index + 1 };
                        },
                    }
                },
                'u' => {
                    // TODO: add more/better/broader tests for this.
                    // TODO: we are already inside a nice, clean state machine... use it
                    // instead of this hacky code.
                    if (slice.len > index + 2 and slice[index + 1] == '{') {
                        // Only look a few bytes ahead for the '}', which limits the
                        // escape to at most six hex digits.
                        const search_end = std.math.min(index + 9, slice.len);
                        if (std.mem.indexOfScalarPos(u8, slice[0..search_end], index + 3, '}')) |index_end| {
                            const hex_str = slice[index + 2 .. index_end];
                            // Six hex digits can exceed a u21, so overflow is a real
                            // possibility here and is reported like any other bad escape.
                            const codepoint = std.fmt.parseUnsigned(u21, hex_str, 16) catch
                                return Result{ .invalid_unicode_escape = index + 1 };
                            // Encode as UTF-8; this rejects surrogate halves and values
                            // above 0x10ffff instead of emitting raw integer bytes.
                            var utf8_buf: [4]u8 = undefined;
                            const utf8_len = std.unicode.utf8Encode(codepoint, &utf8_buf) catch
                                return Result{ .invalid_unicode_escape = index + 1 };
                            try buf.appendSlice(utf8_buf[0..utf8_len]);
                            state = State.Start;
                            index = index_end; // loop-header increments
                        } else {
                            return Result{ .missing_matching_rbrace = index + 1 };
                        }
                    } else {
                        return Result{ .expected_unicode_digits = index };
                    }
                },
                else => {
                    return Result{ .invalid_character = index };
                },
            },
        }
    } else unreachable; // TODO should not need else unreachable on while(true)
}
/// Convenience wrapper around `parseAppend` that decodes `bytes` into a freshly
/// allocated slice. Any parse failure is collapsed into
/// `error.InvalidStringLiteral`; no position information is returned.
/// Caller owns returned memory.
pub fn parseAlloc(allocator: *std.mem.Allocator, bytes: []const u8) ParseError![]u8 {
    var list = std.ArrayList(u8).init(allocator);
    // Releases the partial output on every failure path; after `toOwnedSlice`
    // the list no longer owns the returned memory.
    defer list.deinit();

    const outcome = try parseAppend(&list, bytes);
    switch (outcome) {
        .success => {},
        else => return error.InvalidStringLiteral,
    }
    return list.toOwnedSlice();
}
test "parse" {
    const expectEqualStrings = std.testing.expectEqualStrings;
    const expectError = std.testing.expectError;

    // Results are intentionally never freed; the fixed buffer is sized with
    // headroom so that does not matter.
    var fixed_buf_mem: [128]u8 = undefined;
    var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(fixed_buf_mem[0..]);
    const alloc = &fixed_buf_alloc.allocator;

    try expectEqualStrings("foo", try parseAlloc(alloc, "\"foo\""));

    // The backslashes are doubled so the parser under test receives the literal
    // text `\x6f`; a single backslash would be resolved by the Zig compiler and
    // the parser would only ever see `"foo"`.
    try expectEqualStrings("foo", try parseAlloc(alloc, "\"f\\x6f\\x6f\""));

    // Here the compiler resolves `\u{1f4af}` itself, so this checks that raw
    // UTF-8 bytes inside a literal pass through unchanged.
    try expectEqualStrings("f💯", try parseAlloc(alloc, "\"f\u{1f4af}\""));

    // Single-character escapes.
    try expectEqualStrings("a\tb\n", try parseAlloc(alloc, "\"a\\tb\\n\""));

    // An unknown escape character is rejected.
    try expectError(error.InvalidStringLiteral, parseAlloc(alloc, "\"\\q\""));
}