zig/lib/fuzzer.zig
Andrew Kelley 892ce7ef52 rework fuzzing API
The previous API used `std.testing.fuzzInput(.{})`, but that had problems:
users would incorrectly call it multiple times, and any work done to obtain
the corpus could end up included in coverage analysis and slow down iteration
speed, both of which must be avoided.

This commit restructures it so that the main loop lives in libfuzzer and
directly calls the "test one" function.
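
In other words, the loop now looks roughly like this (a simplified sketch of
the code below; the test runner compiles the unit test behind the exported
`fuzzer_one` symbol):

    // In libfuzzer: the loop owns mutation and coverage bookkeeping.
    extern fn fuzzer_one(input_ptr: [*]const u8, input_len: usize) callconv(.C) void;

    fn start(f: *Fuzzer) !void {
        while (true) {
            try f.mutate();
            fuzzer_one(f.input.items.ptr, f.input.items.len);
            // ...analyze coverage, keep interesting inputs...
        }
    }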

In this commit I was a little too aggressive because I made the test
runner export `fuzzer_one` for this purpose. This was motivated by
performance, but it causes "exported symbol collision: fuzzer_one" to
occur when more than one fuzz test is provided.

There are three ways to solve this:

1. libfuzzer needs to be passed a function pointer instead (sketched after
   this list). Possible performance downside.

2. build runner needs to build a different process per fuzz test.
   Potentially wasteful and unclear how to isolate them.

3. test runner needs to perform a relocation at runtime to point the
   function call to the relevant unit test. Portability issues and
   dubious performance gains.
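
For illustration, option 1 could look roughly like this (hypothetical
signatures; nothing below is implemented in this commit):

    // Hypothetical: the test runner hands libfuzzer a pointer to the test.
    const TestOne = *const fn (input_ptr: [*]const u8, input_len: usize) callconv(.C) void;

    export fn fuzzer_start(test_one: TestOne) void {
        // Same loop as today, but every iteration goes through an
        // indirect call:
        // test_one(input.ptr, input.len);
    }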
2024-09-11 13:41:29 -07:00


const builtin = @import("builtin");
const std = @import("std");
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const fatal = std.process.fatal;
const SeenPcsHeader = std.Build.Fuzz.abi.SeenPcsHeader;

pub const std_options = .{
    .logFn = logOverride,
};

var log_file: ?std.fs.File = null;

fn logOverride(
    comptime level: std.log.Level,
    comptime scope: @Type(.enum_literal),
    comptime format: []const u8,
    args: anytype,
) void {
    const f = if (log_file) |f| f else f: {
        const f = fuzzer.cache_dir.createFile("tmp/libfuzzer.log", .{}) catch
            @panic("failed to open fuzzer log file");
        log_file = f;
        break :f f;
    };
    const prefix1 = comptime level.asText();
    const prefix2 = if (scope == .default) ": " else "(" ++ @tagName(scope) ++ "): ";
    f.writer().print(prefix1 ++ prefix2 ++ format ++ "\n", args) catch @panic("failed to write to fuzzer log");
}
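
// SanitizerCoverage hooks. The compiler instruments the code under test so
// that comparisons, switches, and indirect calls invoke these callbacks,
// which is how the fuzzer observes control flow.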
export threadlocal var __sancov_lowest_stack: usize = std.math.maxInt(usize);

export fn __sanitizer_cov_trace_const_cmp1(arg1: u8, arg2: u8) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_cmp1(arg1: u8, arg2: u8) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_const_cmp2(arg1: u16, arg2: u16) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_cmp2(arg1: u16, arg2: u16) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_const_cmp4(arg1: u32, arg2: u32) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_cmp4(arg1: u32, arg2: u32) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_const_cmp8(arg1: u64, arg2: u64) void {
    handleCmp(@returnAddress(), arg1, arg2);
}
export fn __sanitizer_cov_trace_cmp8(arg1: u64, arg2: u64) void {
    handleCmp(@returnAddress(), arg1, arg2);
}

export fn __sanitizer_cov_trace_switch(val: u64, cases_ptr: [*]u64) void {
    const pc = @returnAddress();
    const len = cases_ptr[0];
    const val_size_in_bits = cases_ptr[1];
    const cases = cases_ptr[2..][0..len];
    _ = val;
    fuzzer.visitPc(pc);
    _ = val_size_in_bits;
    _ = cases;
    //std.log.debug("0x{x}: switch on value {d} ({d} bits) with {d} cases", .{
    //    pc, val, val_size_in_bits, cases.len,
    //});
}

export fn __sanitizer_cov_trace_pc_indir(callee: usize) void {
    const pc = @returnAddress();
    _ = callee;
    fuzzer.visitPc(pc);
    //std.log.debug("0x{x}: indirect call to 0x{x}", .{ pc, callee });
}
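
// Folding the operand values into the PC means that reaching the same
// comparison with different operands produces a distinct coverage entry,
// making progress toward satisfying a comparison visible to the fuzzer.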
fn handleCmp(pc: usize, arg1: u64, arg2: u64) void {
    fuzzer.visitPc(pc ^ arg1 ^ arg2);
    //std.log.debug("0x{x}: comparison of {d} and {d}", .{ pc, arg1, arg2 });
}
const Fuzzer = struct {
    gpa: Allocator,
    rng: std.Random.DefaultPrng,
    input: std.ArrayListUnmanaged(u8),
    pcs: []const usize,
    pc_counters: []u8,
    n_runs: usize,
    recent_cases: RunMap,
    /// Data collected from code coverage instrumentation from one execution of
    /// the test function.
    coverage: Coverage,
    /// Tracks which PCs have been seen across all runs that do not crash the
    /// fuzzer process. Stored in a memory-mapped file so that it can be shared
    /// with other processes and viewed while the fuzzer is running.
    seen_pcs: MemoryMappedList,
    cache_dir: std.fs.Dir,
    /// Identifies the file name that will be used to store coverage
    /// information, available to other processes.
    coverage_id: u64,

    const RunMap = std.ArrayHashMapUnmanaged(Run, void, Run.HashContext, false);

    const Coverage = struct {
        pc_table: std.AutoArrayHashMapUnmanaged(usize, void),
        run_id_hasher: std.hash.Wyhash,

        fn reset(cov: *Coverage) void {
            cov.pc_table.clearRetainingCapacity();
            cov.run_id_hasher = std.hash.Wyhash.init(0);
        }
    };

    const Run = struct {
        id: Id,
        input: []const u8,
        score: usize,

        const Id = u64;

        const HashContext = struct {
            pub fn eql(ctx: HashContext, a: Run, b: Run, b_index: usize) bool {
                _ = b_index;
                _ = ctx;
                return a.id == b.id;
            }
            pub fn hash(ctx: HashContext, a: Run) u32 {
                _ = ctx;
                return @truncate(a.id);
            }
        };

        fn deinit(run: *Run, gpa: Allocator) void {
            gpa.free(run.input);
            run.* = undefined;
        }
    };

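    /// ABI-stable pair used to pass a slice across the C calling convention
    /// boundary (see `fuzzer_init`).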
    const Slice = extern struct {
        ptr: [*]const u8,
        len: usize,

        fn toZig(s: Slice) []const u8 {
            return s.ptr[0..s.len];
        }

        fn fromZig(s: []const u8) Slice {
            return .{
                .ptr = s.ptr,
                .len = s.len,
            };
        }
    };

    const Analysis = struct {
        score: usize,
        id: Run.Id,
    };

    fn init(f: *Fuzzer, cache_dir: std.fs.Dir, pc_counters: []u8, pcs: []const usize) !void {
        f.cache_dir = cache_dir;
        f.pc_counters = pc_counters;
        f.pcs = pcs;

        // Choose a file name for the coverage based on a hash of the PCs that
        // will be stored within.
        const pc_digest = std.hash.Wyhash.hash(0, std.mem.sliceAsBytes(pcs));
        f.coverage_id = pc_digest;
        const hex_digest = std.fmt.hex(pc_digest);
        const coverage_file_path = "v/" ++ hex_digest;

        // Layout of this file:
        // - Header
        // - list of PC addresses (usize elements)
        // - list of hit flag, 1 bit per address (stored in u8 elements)
        const coverage_file = createFileBail(cache_dir, coverage_file_path, .{
            .read = true,
            .truncate = false,
        });
        defer coverage_file.close();
        const n_bitset_elems = (pcs.len + @bitSizeOf(usize) - 1) / @bitSizeOf(usize);
        comptime assert(SeenPcsHeader.trailing[0] == .pc_bits_usize);
        comptime assert(SeenPcsHeader.trailing[1] == .pc_addr);
        const bytes_len = @sizeOf(SeenPcsHeader) +
            n_bitset_elems * @sizeOf(usize) +
            pcs.len * @sizeOf(usize);
        const existing_len = coverage_file.getEndPos() catch |err| {
            fatal("unable to check len of coverage file: {s}", .{@errorName(err)});
        };
        if (existing_len == 0) {
            coverage_file.setEndPos(bytes_len) catch |err| {
                fatal("unable to set len of coverage file: {s}", .{@errorName(err)});
            };
        } else if (existing_len != bytes_len) {
            fatal("incompatible existing coverage file (differing lengths)", .{});
        }
        f.seen_pcs = MemoryMappedList.init(coverage_file, existing_len, bytes_len) catch |err| {
            fatal("unable to init coverage memory map: {s}", .{@errorName(err)});
        };
        if (existing_len != 0) {
            const existing_pcs_bytes = f.seen_pcs.items[@sizeOf(SeenPcsHeader) + @sizeOf(usize) * n_bitset_elems ..][0 .. pcs.len * @sizeOf(usize)];
            const existing_pcs = std.mem.bytesAsSlice(usize, existing_pcs_bytes);
            for (existing_pcs, pcs, 0..) |old, new, i| {
                if (old != new) {
                    fatal("incompatible existing coverage file (differing PC at index {d}: {x} != {x})", .{
                        i, old, new,
                    });
                }
            }
        } else {
            const header: SeenPcsHeader = .{
                .n_runs = 0,
                .unique_runs = 0,
                .pcs_len = pcs.len,
                .lowest_stack = std.math.maxInt(usize),
            };
            f.seen_pcs.appendSliceAssumeCapacity(std.mem.asBytes(&header));
            f.seen_pcs.appendNTimesAssumeCapacity(0, n_bitset_elems * @sizeOf(usize));
            f.seen_pcs.appendSliceAssumeCapacity(std.mem.sliceAsBytes(pcs));
        }
    }

    fn analyzeLastRun(f: *Fuzzer) Analysis {
        return .{
            .id = f.coverage.run_id_hasher.final(),
            .score = f.coverage.pc_table.count(),
        };
    }

    fn start(f: *Fuzzer) !void {
        const gpa = f.gpa;
        const rng = fuzzer.rng.random();

        // Prepare initial input.
        assert(f.recent_cases.entries.len == 0);
        assert(f.n_runs == 0);
        try f.recent_cases.ensureUnusedCapacity(gpa, 100);
        const len = rng.uintLessThanBiased(usize, 80);
        try f.input.resize(gpa, len);
        rng.bytes(f.input.items);
        f.recent_cases.putAssumeCapacity(.{
            .id = 0,
            .input = try gpa.dupe(u8, f.input.items),
            .score = 0,
        }, {});

        const header: *volatile SeenPcsHeader = @ptrCast(f.seen_pcs.items[0..@sizeOf(SeenPcsHeader)]);
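
        // Main fuzz loop: pick one of the recently interesting inputs, mutate
        // it, run the test function exactly once, then score the run by the
        // set of PCs it covered.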
        while (true) {
            const chosen_index = rng.uintLessThanBiased(usize, f.recent_cases.entries.len);
            const run = &f.recent_cases.keys()[chosen_index];
            f.input.clearRetainingCapacity();
            f.input.appendSliceAssumeCapacity(run.input);
            try f.mutate();

            _ = @atomicRmw(usize, &header.lowest_stack, .Min, __sancov_lowest_stack, .monotonic);
            @memset(f.pc_counters, 0);
            f.coverage.reset();
            fuzzer_one(f.input.items.ptr, f.input.items.len);
            f.n_runs += 1;
            _ = @atomicRmw(usize, &header.n_runs, .Add, 1, .monotonic);
            if (f.n_runs % 10000 == 0) f.dumpStats();

            const analysis = f.analyzeLastRun();
            const gop = f.recent_cases.getOrPutAssumeCapacity(.{
                .id = analysis.id,
                .input = undefined,
                .score = undefined,
            });
            if (gop.found_existing) {
                //std.log.info("duplicate analysis: score={d} id={d}", .{ analysis.score, analysis.id });
                if (f.input.items.len < gop.key_ptr.input.len or gop.key_ptr.score == 0) {
                    gpa.free(gop.key_ptr.input);
                    gop.key_ptr.input = try gpa.dupe(u8, f.input.items);
                    gop.key_ptr.score = analysis.score;
                }
            } else {
                std.log.info("unique analysis: score={d} id={d}", .{ analysis.score, analysis.id });
                gop.key_ptr.* = .{
                    .id = analysis.id,
                    .input = try gpa.dupe(u8, f.input.items),
                    .score = analysis.score,
                };
                {
                    // Track code coverage from all runs.
                    comptime assert(SeenPcsHeader.trailing[0] == .pc_bits_usize);
                    const header_end_ptr: [*]volatile usize = @ptrCast(f.seen_pcs.items[@sizeOf(SeenPcsHeader)..]);
                    const remainder = f.pcs.len % @bitSizeOf(usize);
                    const aligned_len = f.pcs.len - remainder;
                    const seen_pcs = header_end_ptr[0..aligned_len];
                    const pc_counters = std.mem.bytesAsSlice([@bitSizeOf(usize)]u8, f.pc_counters[0..aligned_len]);
                    const V = @Vector(@bitSizeOf(usize), u8);
                    const zero_v: V = @splat(0);
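
                    // Each batch of @bitSizeOf(usize) 8-bit counters is
                    // compared against zero in a single vector op; the
                    // resulting bool vector bitcasts directly to a usize
                    // bitmask, which is OR'ed into the shared bitset.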
                    for (header_end_ptr[0..pc_counters.len], pc_counters) |*elem, *array| {
                        const v: V = array.*;
                        const mask: usize = @bitCast(v != zero_v);
                        _ = @atomicRmw(usize, elem, .Or, mask, .monotonic);
                    }

                    if (remainder > 0) {
                        const i = pc_counters.len;
                        const elem = &seen_pcs[i];
                        var mask: usize = 0;
                        for (f.pc_counters[i * @bitSizeOf(usize) ..][0..remainder], 0..) |byte, bit_index| {
                            mask |= @as(usize, @intFromBool(byte != 0)) << @intCast(bit_index);
                        }
                        _ = @atomicRmw(usize, elem, .Or, mask, .monotonic);
                    }
                }

                _ = @atomicRmw(usize, &header.unique_runs, .Add, 1, .monotonic);
            }
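
            // Periodically cull: sort by score (descending) and keep only the
            // 50 highest-scoring inputs.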
            if (f.recent_cases.entries.len >= 100) {
                const Context = struct {
                    values: []const Run,
                    pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool {
                        return ctx.values[b_index].score < ctx.values[a_index].score;
                    }
                };
                f.recent_cases.sortUnstable(Context{ .values = f.recent_cases.keys() });
                const cap = 50;
                // This has to be done before deinitializing the deleted items.
                const doomed_runs = f.recent_cases.keys()[cap..];
                f.recent_cases.shrinkRetainingCapacity(cap);
                for (doomed_runs) |*doomed_run| {
                    std.log.info("culling score={d} id={d}", .{ doomed_run.score, doomed_run.id });
                    doomed_run.deinit(gpa);
                }
            }
        }
    }

    fn visitPc(f: *Fuzzer, pc: usize) void {
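        // The only possible error from `put` is OutOfMemory; the errdefer
        // turns it into a panic so this function can return void.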
        errdefer |err| oom(err);
        try f.coverage.pc_table.put(f.gpa, pc, {});
        f.coverage.run_id_hasher.update(std.mem.asBytes(&pc));
    }

    fn dumpStats(f: *Fuzzer) void {
        for (f.recent_cases.keys()[0..@min(f.recent_cases.entries.len, 5)], 0..) |run, i| {
            std.log.info("best[{d}] id={x} score={d} input: '{}'", .{
                i, run.id, run.score, std.zig.fmtEscapes(run.input),
            });
        }
    }

    fn mutate(f: *Fuzzer) !void {
        const gpa = f.gpa;
        const rng = fuzzer.rng.random();

        if (f.input.items.len == 0) {
            const len = rng.uintLessThanBiased(usize, 80);
            try f.input.resize(gpa, len);
            rng.bytes(f.input.items);
            return;
        }
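
        // Pick one of three mutations with equal probability: overwrite a
        // byte, delete a byte, or insert a byte.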
        const index = rng.uintLessThanBiased(usize, f.input.items.len * 3);
        if (index < f.input.items.len) {
            f.input.items[index] = rng.int(u8);
        } else if (index < f.input.items.len * 2) {
            _ = f.input.orderedRemove(index - f.input.items.len);
        } else if (index < f.input.items.len * 3) {
            try f.input.insert(gpa, index - f.input.items.len * 2, rng.int(u8));
        } else {
            unreachable;
        }
    }
};

fn createFileBail(dir: std.fs.Dir, sub_path: []const u8, flags: std.fs.File.CreateFlags) std.fs.File {
    return dir.createFile(sub_path, flags) catch |err| switch (err) {
        error.FileNotFound => {
            const dir_name = std.fs.path.dirname(sub_path).?;
            dir.makePath(dir_name) catch |e| {
                fatal("unable to make path '{s}': {s}", .{ dir_name, @errorName(e) });
            };
            return dir.createFile(sub_path, flags) catch |e| {
                fatal("unable to create file '{s}': {s}", .{ sub_path, @errorName(e) });
            };
        },
        else => fatal("unable to create file '{s}': {s}", .{ sub_path, @errorName(err) }),
    };
}

fn oom(err: anytype) noreturn {
    switch (err) {
        error.OutOfMemory => @panic("out of memory"),
    }
}

var general_purpose_allocator: std.heap.GeneralPurposeAllocator(.{}) = .{};

var fuzzer: Fuzzer = .{
    .gpa = general_purpose_allocator.allocator(),
    .rng = std.Random.DefaultPrng.init(0),
    .input = .{},
    .pcs = undefined,
    .pc_counters = undefined,
    .n_runs = 0,
    .recent_cases = .{},
    .coverage = undefined,
    .cache_dir = undefined,
    .seen_pcs = undefined,
    .coverage_id = undefined,
};

/// Invalid until `fuzzer_init` is called.
export fn fuzzer_coverage_id() u64 {
    return fuzzer.coverage_id;
}
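
/// Provided by the test runner, which compiles the selected unit test behind
/// this symbol. Note that exporting a single `fuzzer_one` collides when more
/// than one fuzz test exists in the same binary; see the commit message for
/// the possible ways to resolve this.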
extern fn fuzzer_one(input_ptr: [*]const u8, input_len: usize) callconv(.C) void;

export fn fuzzer_start() void {
    fuzzer.start() catch |err| switch (err) {
        error.OutOfMemory => fatal("out of memory", .{}),
    };
}

export fn fuzzer_init(cache_dir_struct: Fuzzer.Slice) void {
    // Linkers are expected to automatically add `__start_<section>` and
    // `__stop_<section>` symbols when section names are valid C identifiers.
    const pc_counters_start = @extern([*]u8, .{
        .name = "__start___sancov_cntrs",
        .linkage = .weak,
    }) orelse fatal("missing __start___sancov_cntrs symbol", .{});

    const pc_counters_end = @extern([*]u8, .{
        .name = "__stop___sancov_cntrs",
        .linkage = .weak,
    }) orelse fatal("missing __stop___sancov_cntrs symbol", .{});

    const pc_counters = pc_counters_start[0 .. pc_counters_end - pc_counters_start];

    const pcs_start = @extern([*]usize, .{
        .name = "__start___sancov_pcs1",
        .linkage = .weak,
    }) orelse fatal("missing __start___sancov_pcs1 symbol", .{});

    const pcs_end = @extern([*]usize, .{
        .name = "__stop___sancov_pcs1",
        .linkage = .weak,
    }) orelse fatal("missing __stop___sancov_pcs1 symbol", .{});

    const pcs = pcs_start[0 .. pcs_end - pcs_start];

    const cache_dir_path = cache_dir_struct.toZig();
    const cache_dir = if (cache_dir_path.len == 0)
        std.fs.cwd()
    else
        std.fs.cwd().makeOpenPath(cache_dir_path, .{ .iterate = true }) catch |err| {
            fatal("unable to open fuzz directory '{s}': {s}", .{ cache_dir_path, @errorName(err) });
        };

    fuzzer.init(cache_dir, pc_counters, pcs) catch |err|
        fatal("unable to init fuzzer: {s}", .{@errorName(err)});
}
/// Like `std.ArrayListUnmanaged(u8)` but backed by memory mapping.
pub const MemoryMappedList = struct {
    /// Contents of the list.
    ///
    /// The mapping is created once in `init` with a fixed `capacity` and is
    /// never remapped, so pointers into this slice remain valid; only the
    /// length changes as elements are appended.
    items: []align(std.mem.page_size) volatile u8,
    /// How many bytes this list can hold without remapping.
    capacity: usize,
    pub fn init(file: std.fs.File, length: usize, capacity: usize) !MemoryMappedList {
        const ptr = try std.posix.mmap(
            null,
            capacity,
            std.posix.PROT.READ | std.posix.PROT.WRITE,
            .{ .TYPE = .SHARED },
            file.handle,
            0,
        );
        return .{
            .items = ptr[0..length],
            .capacity = capacity,
        };
    }

    /// Append the slice of items to the list.
    /// Asserts that the list can hold the additional items.
    pub fn appendSliceAssumeCapacity(l: *MemoryMappedList, items: []const u8) void {
        const old_len = l.items.len;
        const new_len = old_len + items.len;
        assert(new_len <= l.capacity);
        l.items.len = new_len;
        @memcpy(l.items[old_len..][0..items.len], items);
    }

    /// Append a value to the list `n` times.
    /// Never invalidates element pointers.
    /// The function is inline so that a comptime-known `value` parameter will
    /// have better memset codegen in case it has a repeated byte pattern.
    /// Asserts that the list can hold the additional items.
    pub inline fn appendNTimesAssumeCapacity(l: *MemoryMappedList, value: u8, n: usize) void {
        const new_len = l.items.len + n;
        assert(new_len <= l.capacity);
        @memset(l.items.ptr[l.items.len..new_len], value);
        l.items.len = new_len;
    }
};