zig/lib/std/zig/string_literal.zig
Andrew Kelley 2a990d6966 stage1: rework tokenizer to match stage2
* Extracts AstGen logic from ir.cpp into astgen.cpp. Reduces the
   largest file of stage1 from 33,551 lines to 25,510.
 * tokenizer: rework it completely to match the stage2 tokenizer logic.
   They can now be maintained together; when one is changed, the other
   can be changed in the same way.
   - Each token now takes up 13 bytes instead of 64 bytes. The tokenizer
     does not parse char literals, string literals, integer literals,
     etc into meaningful data. Instead, that happens during parsing or
     astgen.
   - no longer store line offsets. Error messages scan source
     files to find the line/column as needed (same as stage2).
   - main loop: instead of checking the loop, handle a null byte
     explicitly in the switch statements. This is a nice improvement
     that we may want to backport to stage2.
   - delete some dead tokens, artifacts of past syntax that no longer
     exists.
 * Parser: fix a TODO by parsing builtin functions as tokens rather than
   `@` as a separate token. This is how stage2 does it.
 * Remove some debugging infrastructure. These will need to be redone,
   if at all, as the code migrates to match stage2.
   - remove the ast_render code.
   - remove the IR debugging stuff
   - remove the token printing code
2021-05-28 12:58:40 -07:00

161 lines
6.3 KiB
Zig

// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2021 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("../std.zig");
const assert = std.debug.assert;
/// Error set of the high-level `parseAlloc` API.
pub const ParseError = error{
    /// An allocation needed to hold the decoded string failed.
    OutOfMemory,
    /// The input is not a well-formed string literal; no further detail is given.
    InvalidStringLiteral,
};
/// Outcome of `parseAppend`: either `success`, or one diagnostic variant
/// carrying the index at which the problem was detected.
pub const Result = union(enum) {
    /// The whole literal was decoded.
    success,
    /// The character at this index is not allowed.
    invalid_character: usize,
    /// Hex digits were expected at this index.
    expected_hex_digits: usize,
    /// The hex digits at this index are not valid.
    invalid_hex_escape: usize,
    /// The unicode escape at this index is not valid.
    invalid_unicode_escape: usize,
    /// The '{' at this index has no matching '}'.
    missing_matching_rbrace: usize,
    /// Unicode digits were expected at this index.
    expected_unicode_digits: usize,
};
/// Parses `bytes` as a Zig string literal and appends the decoded bytes to `buf`.
/// Asserts `bytes` has '"' at beginning and end.
///
/// Decoding stops with `Result.success` at the first unescaped '"' that follows
/// the opening quote. `\u{...}` escapes are appended as UTF-8; a value that
/// cannot be encoded (above 0x10ffff, or a surrogate half) is reported as
/// `invalid_unicode_escape`.
///
/// Every index carried by a non-success `Result` is an offset into
/// `bytes[1..]`, i.e. it is counted from the first byte after the opening
/// quote. When a non-success `Result` is returned, whatever was decoded before
/// the problem stays appended to `buf`; only `error.OutOfMemory` rolls `buf`
/// back to the length it had on entry.
pub fn parseAppend(buf: *std.ArrayList(u8), bytes: []const u8) error{OutOfMemory}!Result {
    assert(bytes.len >= 2 and bytes[0] == '"' and bytes[bytes.len - 1] == '"');
    const slice = bytes[1..];

    const prev_len = buf.items.len;
    // Every escape decodes to fewer bytes than it occupies in the source, so
    // the source length minus the closing quote is an upper bound.
    try buf.ensureCapacity(prev_len + slice.len - 1);
    errdefer buf.shrinkRetainingCapacity(prev_len);

    const State = enum {
        Start,
        Backslash,
    };

    var state = State.Start;
    var index: usize = 0;
    while (index < slice.len) : (index += 1) {
        const b = slice[index];
        switch (state) {
            State.Start => switch (b) {
                '\\' => state = State.Backslash,
                '\n' => {
                    return Result{ .invalid_character = index };
                },
                '"' => return Result.success,
                else => try buf.append(b),
            },
            State.Backslash => switch (b) {
                'n' => {
                    try buf.append('\n');
                    state = State.Start;
                },
                'r' => {
                    try buf.append('\r');
                    state = State.Start;
                },
                '\\' => {
                    try buf.append('\\');
                    state = State.Start;
                },
                't' => {
                    try buf.append('\t');
                    state = State.Start;
                },
                '\'' => {
                    try buf.append('\'');
                    state = State.Start;
                },
                '"' => {
                    try buf.append('"');
                    state = State.Start;
                },
                'x' => {
                    // TODO: add more/better/broader tests for this.
                    // Exactly two hex digits must follow the 'x'.
                    const index_continue = index + 3;
                    if (slice.len < index_continue) {
                        return Result{ .expected_hex_digits = index };
                    }
                    if (std.fmt.parseUnsigned(u8, slice[index + 1 .. index_continue], 16)) |byte| {
                        try buf.append(byte);
                        state = State.Start;
                        index = index_continue - 1; // loop-header increments again
                    } else |err| switch (err) {
                        error.Overflow => unreachable, // 2 digits base 16 fits in a u8.
                        error.InvalidCharacter => {
                            return Result{ .invalid_hex_escape = index + 1 };
                        },
                    }
                },
                'u' => {
                    // TODO: add more/better/broader tests for this.
                    // TODO: we are already inside a nice, clean state machine... use it
                    // instead of this hacky code.
                    if (slice.len <= index + 2 or slice[index + 1] != '{') {
                        return Result{ .expected_unicode_digits = index };
                    }
                    // Look for the closing brace only within the window that
                    // allows at most six hex digits after the '{'.
                    const search_end = std.math.min(index + 9, slice.len);
                    const index_end = std.mem.indexOfScalarPos(u8, slice[0..search_end], index + 3, '}') orelse {
                        return Result{ .missing_matching_rbrace = index + 1 };
                    };
                    const hex_str = slice[index + 2 .. index_end];
                    // Overflowing a u21 means the value cannot be a code point,
                    // so it is reported the same way as malformed digits.
                    const codepoint = std.fmt.parseUnsigned(u21, hex_str, 16) catch {
                        return Result{ .invalid_unicode_escape = index + 1 };
                    };
                    // Encode as UTF-8 rather than copying the integer's raw
                    // (endianness-dependent) bytes into the output.
                    var utf8_buf: [4]u8 = undefined;
                    const utf8_len = std.unicode.utf8Encode(codepoint, &utf8_buf) catch {
                        return Result{ .invalid_unicode_escape = index + 1 };
                    };
                    try buf.appendSlice(utf8_buf[0..utf8_len]);
                    state = State.Start;
                    index = index_end; // loop-header increments past the '}'
                },
                else => {
                    return Result{ .invalid_character = index };
                },
            },
        }
    }

    // The input ran out without an unescaped closing quote: the final '"' was
    // consumed by a trailing `\"` escape. Report that quote instead of reading
    // past the end of `slice`.
    return Result{ .invalid_character = slice.len - 1 };
}
/// Convenience wrapper around `parseAppend` that returns the decoded string
/// in freshly allocated memory.
/// Any non-success parse result is collapsed into `error.InvalidStringLiteral`;
/// use `parseAppend` directly when the failure details are needed.
/// Caller owns returned memory.
pub fn parseAlloc(allocator: *std.mem.Allocator, bytes: []const u8) ParseError![]u8 {
    var decoded = std.ArrayList(u8).init(allocator);
    // Harmless after `toOwnedSlice`; releases the buffer on every failure path.
    defer decoded.deinit();

    const outcome = try parseAppend(&decoded, bytes);
    if (outcome != .success) return error.InvalidStringLiteral;
    return decoded.toOwnedSlice();
}
test "parse" {
    const expect = std.testing.expect;
    const expectError = std.testing.expectError;
    const eql = std.mem.eql;

    // Results are never freed individually; the fixed buffer just has to be
    // large enough for all of them.
    var fixed_buf_mem: [256]u8 = undefined;
    var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(fixed_buf_mem[0..]);
    var alloc = &fixed_buf_alloc.allocator;

    // In these inputs the Zig compiler itself resolves `\x6f` and `\u{1f4af}`,
    // so the parser only ever sees the already-decoded bytes.
    try expect(eql(u8, "foo", try parseAlloc(alloc, "\"foo\"")));
    try expect(eql(u8, "foo", try parseAlloc(alloc, "\"f\x6f\x6f\"")));
    try expect(eql(u8, "f💯", try parseAlloc(alloc, "\"f\u{1f4af}\"")));

    // Doubled backslashes make the escape sequences reach the parser verbatim.
    try expect(eql(u8, "foo", try parseAlloc(alloc, "\"f\\x6f\\x6f\"")));
    try expect(eql(u8, "a\n\tb\\\"'", try parseAlloc(alloc, "\"a\\n\\tb\\\\\\\"\\'\"")));

    // Malformed escapes are reported as an invalid literal.
    try expectError(error.InvalidStringLiteral, parseAlloc(alloc, "\"\\q\""));
    try expectError(error.InvalidStringLiteral, parseAlloc(alloc, "\"\\xzz\""));
    try expectError(error.InvalidStringLiteral, parseAlloc(alloc, "\"\\u{110000}\""));
}