zig/lib/compiler/aro/aro/Source.zig
Andrew Kelley 240d0b68f6 make aro-based translate-c lazily built from source
Part of #19063.

Primarily, this moves Aro from deps/ to lib/compiler/ so that it can be
lazily compiled from source. src/aro_translate_c.zig is moved to
lib/compiler/aro_translate_c.zig and some of Zig CLI logic moved to a
main() function there.

aro_translate_c.zig becomes the "common" import for clang-based
translate-c.

Not all of the compiler could be detangled from Aro, however, so for
now Aro is still compiled together with the main compiler sources,
because the clang-based translate-c depends on it. Once aro-based
translate-c achieves feature parity with the clang-based translate-c
implementation, the clang-based one can be removed from Zig.

Aro made it unnecessarily difficult to depend on with these .def files
and all these Zig module requirements. I looked at the .def files and
made these observations:

- The canonical source is llvm .def files.
- Therefore there is an update process to sync with llvm that involves
  regenerating the .def files in Aro.
- Therefore you might as well just regenerate the .zig files directly
  and check those into Aro.
- Also with a small amount of tinkering, the file size on disk of these
  generated .zig files can be made many times smaller, without
  compromising type safety in the usage of the data.

This would make things much easier on Zig as a downstream project;
in particular, we could remove those pesky stubs when bootstrapping.

I have gone ahead with these changes since they unblock me and I will
have a chat with Vexu to see what he thinks.
2024-02-28 13:21:05 -07:00

128 lines
3.8 KiB
Zig

const std = @import("std");
/// Identifier of a source. The enum is non-exhaustive: every `u32` value
/// other than the two named tags is also a valid `Id`.
pub const Id = enum(u32) {
    /// Zero value; this is the default `id` of a `Location`.
    unused = 0,
    /// NOTE(review): presumably the id reserved for generated (non-file)
    /// text; confirm against the code that hands out ids.
    generated = 1,
    _,
};
/// How a file is classified when line markers are emitted in -E mode.
pub const Kind = enum {
    /// An ordinary, non-system file.
    user,
    /// A file that was included from a system include directory.
    system,
    /// A file that was included from an "implicit extern C" directory.
    extern_c_system,
};
/// A position inside a source.
pub const Location = struct {
    /// Source this location belongs to.
    id: Id = .unused,
    /// Byte index into that source's `buf`.
    byte_offset: u32 = 0,
    /// Line number counted after backslash-newline splicing; see
    /// `physicalLine` for the number as it appears in the raw file.
    line: u32 = 0,

    /// Reports whether `a` and `b` agree in every field.
    pub fn eql(a: Location, b: Location) bool {
        if (a.id != b.id) return false;
        if (a.byte_offset != b.byte_offset) return false;
        return a.line == b.line;
    }
};
/// This file is itself a struct type; `Source` is its name within the file.
const Source = @This();
/// NOTE(review): presumably the path this source was read from; confirm
/// against the code that constructs `Source` values.
path: []const u8,
/// Contents of the source after backslash+newline pairs have been deleted
/// (see `splice_locs`). `Location.byte_offset` indexes into this buffer.
buf: []const u8,
/// Identifier of this source.
id: Id,
/// each entry represents a byte position within `buf` where a backslash+newline was deleted
/// from the original raw buffer. The same position can appear multiple times if multiple
/// consecutive splices happened. Guaranteed to be non-decreasing
splice_locs: []const u32,
/// Classification of this file for line marker output in -E mode.
kind: Kind,
/// Returns how many entries of `splice_locs` are less than or equal to
/// `byte_offset`, i.e. the index of the first entry that is greater than it
/// (or `splice_locs.len` when there is none).
///
/// Relies on `splice_locs` being non-decreasing, which allows a binary
/// search instead of a scan over every entry.
pub fn numSplicesBefore(source: Source, byte_offset: u32) u32 {
    var lo: usize = 0;
    var hi: usize = source.splice_locs.len;
    // Invariant: every entry before `lo` is <= byte_offset, and every entry
    // at or after `hi` is > byte_offset.
    while (lo < hi) {
        const mid = lo + (hi - lo) / 2;
        if (source.splice_locs[mid] > byte_offset) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }
    return @intCast(lo);
}
/// Maps the line of `loc` back to the line number in the raw, unspliced
/// file, which is the number a user sees in their text editor.
/// Every splice counted by `numSplicesBefore` for `loc.byte_offset` adds
/// one line to `loc.line`.
pub fn physicalLine(source: Source, loc: Location) u32 {
    const spliced_lines = source.numSplicesBefore(loc.byte_offset);
    return loc.line + spliced_lines;
}
/// Result of `lineCol`: the text of a line plus a position within it.
const LineCol = struct {
    /// Slice of the source buffer holding the line's text.
    line: []const u8,
    /// The location's line number plus the splice index found by `lineCol`.
    line_no: u32,
    /// Column of the location; counting starts at 1.
    col: u32,
    /// Sum of the widths of the code points that precede the location on the line.
    width: u32,
    /// True when `line` stops at a splice point instead of a newline or the end of the buffer.
    end_with_splice: bool,
};
/// Finds the line of `source.buf` that contains `loc` and reports the
/// position of `loc` inside it.
///
/// The returned `line` begins after the last '\n' before `loc.byte_offset`,
/// or at the first splice point after that newline if the splice lies before
/// `loc.byte_offset`. It ends at the next '\n', at a splice point, or at the
/// end of the buffer. `col` starts at 1 and advances once per decoded code
/// point or undecodable byte; `width` sums `codepointWidth` over the decoded
/// code points only.
///
/// `loc.byte_offset` must not exceed `source.buf.len`.
pub fn lineCol(source: Source, loc: Location) LineCol {
    var start: usize = 0;
    // find the start of the line which is either a newline or a splice
    if (std.mem.lastIndexOfScalar(u8, source.buf[0..loc.byte_offset], '\n')) |some| start = some + 1;
    // Only the first splice located after `start` is examined: if it lies
    // before the location the line is taken to begin there, and the splice
    // index moves past it.
    const splice_index: u32 = for (source.splice_locs, 0..) |splice_offset, i| {
        if (splice_offset > start) {
            if (splice_offset < loc.byte_offset) {
                start = splice_offset;
                break @as(u32, @intCast(i)) + 1;
            }
            break @intCast(i);
        }
    } else @intCast(source.splice_locs.len);
    var i: usize = start;
    var col: u32 = 1;
    var width: u32 = 0;
    while (i < loc.byte_offset) : (col += 1) { // TODO this is still incorrect, but better
        const len = std.unicode.utf8ByteSequenceLength(source.buf[i]) catch {
            i += 1;
            continue;
        };
        // A lead byte can announce more bytes than the buffer still holds
        // (truncated sequence at the end of the input). Slicing `len` bytes
        // would then run out of bounds, so treat it like any other invalid
        // byte. `i < loc.byte_offset <= buf.len` keeps the subtraction safe.
        if (len > source.buf.len - i) {
            i += 1;
            continue;
        }
        const cp = std.unicode.utf8Decode(source.buf[i..][0..len]) catch {
            i += 1;
            continue;
        };
        width += codepointWidth(cp);
        i += len;
    }
    // find the end of the line which is either a newline, EOF or a splice
    var nl = source.buf.len;
    var end_with_splice = false;
    if (std.mem.indexOfScalar(u8, source.buf[start..], '\n')) |some| nl = some + start;
    // A splice strictly between `start` and the newline ends the line early.
    if (source.splice_locs.len > splice_index and nl > source.splice_locs[splice_index] and source.splice_locs[splice_index] > start) {
        end_with_splice = true;
        nl = source.splice_locs[splice_index];
    }
    return .{
        .line = source.buf[start..nl],
        .line_no = loc.line + splice_index,
        .col = col,
        .width = width,
        .end_with_splice = end_with_splice,
    };
}
/// Returns 2 when `cp` falls inside one of the ranges listed below and 1 for
/// every other value. `lineCol` sums these values into `LineCol.width`.
fn codepointWidth(cp: u32) u32 {
    // Inclusive `[first, last]` ranges that count as width 2.
    const wide_ranges = [_][2]u32{
        .{ 0x1100, 0x115F },
        .{ 0x2329, 0x232A },
        .{ 0x2E80, 0x303F },
        .{ 0x3040, 0x3247 },
        .{ 0x3250, 0x4DBF },
        .{ 0x4E00, 0xA4C6 },
        .{ 0xA960, 0xA97C },
        .{ 0xAC00, 0xD7A3 },
        .{ 0xF900, 0xFAFF },
        .{ 0xFE10, 0xFE19 },
        .{ 0xFE30, 0xFE6B },
        .{ 0xFF01, 0xFF60 },
        .{ 0xFFE0, 0xFFE6 },
        .{ 0x1B000, 0x1B001 },
        .{ 0x1F200, 0x1F251 },
        .{ 0x1F300, 0x1F5FF },
        .{ 0x1F900, 0x1F9FF },
        .{ 0x20000, 0x3FFFD },
    };
    for (wide_ranges) |range| {
        if (cp >= range[0] and cp <= range[1]) return 2;
    }
    return 1;
}