diff --git a/lib/std/os/bits/linux.zig b/lib/std/os/bits/linux.zig
index 41d681554e..4ad1fc7de4 100644
--- a/lib/std/os/bits/linux.zig
+++ b/lib/std/os/bits/linux.zig
@@ -19,6 +19,7 @@ pub usingnamespace switch (builtin.arch) {
 };
 
 pub usingnamespace @import("linux/netlink.zig");
+pub const bpf = @import("linux/bpf.zig");
 
 const is_mips = builtin.arch.isMIPS();
 
diff --git a/lib/std/os/bits/linux/bpf.zig b/lib/std/os/bits/linux/bpf.zig
new file mode 100644
index 0000000000..150d3e9135
--- /dev/null
+++ b/lib/std/os/bits/linux/bpf.zig
@@ -0,0 +1,608 @@
+usingnamespace std.os;
+const std = @import("../../../std.zig");
+
+// instruction classes
+/// jmp mode in word width
+pub const JMP32 = 0x06;
+/// alu mode in double word width
+pub const ALU64 = 0x07;
+
+// ld/ldx fields
+/// double word (64-bit)
+pub const DW = 0x18;
+/// exclusive add
+pub const XADD = 0xc0;
+
+// alu/jmp fields
+/// mov reg to reg
+pub const MOV = 0xb0;
+/// sign extending arithmetic shift right
+pub const ARSH = 0xc0;
+
+// change endianness of a register
+/// endianness conversion operation; TO_LE/TO_BE below are its flags
+pub const END = 0xd0;
+/// convert to little-endian
+pub const TO_LE = 0x00;
+/// convert to big-endian
+pub const TO_BE = 0x08;
+pub const FROM_LE = TO_LE;
+pub const FROM_BE = TO_BE;
+
+// jmp encodings
+/// jump !=
+pub const JNE = 0x50;
+/// LT is unsigned, '<'
+pub const JLT = 0xa0;
+/// LE is unsigned, '<='
+pub const JLE = 0xb0;
+/// SGT is signed '>', GT in x86
+pub const JSGT = 0x60;
+/// SGE is signed '>=', GE in x86
+pub const JSGE = 0x70;
+/// SLT is signed, '<'
+pub const JSLT = 0xc0;
+/// SLE is signed, '<='
+pub const JSLE = 0xd0;
+/// function call
+pub const CALL = 0x80;
+/// function return
+pub const EXIT = 0x90;
+
+/// Flag for prog_attach command. If a sub-cgroup installs some bpf program, the
+/// program in this cgroup yields to sub-cgroup program.
+pub const F_ALLOW_OVERRIDE = 0x1;
+/// Flag for prog_attach command. If a sub-cgroup installs some bpf program,
+/// that cgroup program gets run in addition to the program in this cgroup.
+pub const F_ALLOW_MULTI = 0x2;
+/// Flag for prog_attach command.
+pub const F_REPLACE = 0x4;
+
+/// If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the verifier
+/// will perform strict alignment checking as if the kernel has been built with
+/// CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, and NET_IP_ALIGN defined to 2.
+pub const F_STRICT_ALIGNMENT = 0x1;
+
+/// If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the verifier will
+/// allow any alignment whatsoever. On platforms with strict alignment
+/// requirements for loads and stores (such as sparc and mips) the verifier
+/// validates that all loads and stores provably follow this requirement. This
+/// flag turns that checking and enforcement off.
+///
+/// It is mostly used for testing when we want to validate the context and
+/// memory access aspects of the verifier, but because of an unaligned access
+/// the alignment check would trigger before the one we are interested in.
+pub const F_ANY_ALIGNMENT = 0x2;
+
+/// BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose.
+/// Verifier does sub-register def/use analysis and identifies instructions
+/// whose def only matters for low 32-bit, high 32-bit is never referenced later
+/// through implicit zero extension. Therefore verifier notifies JIT back-ends
+/// that it is safe to ignore clearing high 32-bit for these instructions. This
+/// saves some back-ends a lot of code-gen. However such optimization is not
+/// necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends
+/// hence haven't used verifier's analysis result. But, we really want to have a
+/// way to be able to verify the correctness of the described optimization on
+/// x86_64 on which testsuites are frequently exercised.
+///
+/// So, this flag is introduced. Once it is set, verifier will randomize high
+/// 32-bit for those instructions which have been identified as safe to ignore
+/// them. Then, if verifier is not doing correct analysis, such randomization
+/// will regress tests to expose bugs.
+pub const F_TEST_RND_HI32 = 0x4;
+
+/// When BPF ldimm64's insn[0].src_reg != 0 then this can have two extensions:
+/// insn[0].src_reg:  BPF_PSEUDO_MAP_FD   BPF_PSEUDO_MAP_VALUE
+/// insn[0].imm:      map fd              map fd
+/// insn[1].imm:      0                   offset into value
+/// insn[0].off:      0                   0
+/// insn[1].off:      0                   0
+/// ldimm64 rewrite:  address of map      address of map[0]+offset
+/// verifier type:    CONST_PTR_TO_MAP    PTR_TO_MAP_VALUE
+pub const PSEUDO_MAP_FD = 1;
+pub const PSEUDO_MAP_VALUE = 2;
+
+/// when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+/// offset to another bpf function
+pub const PSEUDO_CALL = 1;
+
+/// flag for BPF_MAP_UPDATE_ELEM command. create new element or update existing
+pub const ANY = 0;
+/// flag for BPF_MAP_UPDATE_ELEM command. create new element if it didn't exist
+pub const NOEXIST = 1;
+/// flag for BPF_MAP_UPDATE_ELEM command. update existing element
+pub const EXIST = 2;
+/// flag for BPF_MAP_UPDATE_ELEM command. spin_lock-ed map_lookup/map_update
+pub const F_LOCK = 4;
+
+/// flag for BPF_MAP_CREATE command.
+pub const BPF_F_NO_PREALLOC = 0x1;
+/// flag for BPF_MAP_CREATE command. Instead of having one common LRU list in
+/// the BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list which can
+/// scale and perform better. Note, the LRU nodes (including free nodes) cannot
+/// be moved across different LRU lists.
+pub const BPF_F_NO_COMMON_LRU = 0x2;
+/// flag for BPF_MAP_CREATE command. Specify numa node during map creation
+pub const BPF_F_NUMA_NODE = 0x4;
+/// flag for BPF_MAP_CREATE command. Flags for BPF object read access from
+/// syscall side
+pub const BPF_F_RDONLY = 0x8;
+/// flag for BPF_MAP_CREATE command. Flags for BPF object write access from
+/// syscall side
+pub const BPF_F_WRONLY = 0x10;
+/// flag for BPF_MAP_CREATE command. Flag for stack_map, store build_id+offset
+/// instead of pointer
+pub const BPF_F_STACK_BUILD_ID = 0x20;
+/// flag for BPF_MAP_CREATE command. Zero-initialize hash function seed. This
+/// should only be used for testing.
+pub const BPF_F_ZERO_SEED = 0x40;
+/// flag for BPF_MAP_CREATE command. Flags for accessing BPF object from program
+/// side.
+pub const BPF_F_RDONLY_PROG = 0x80;
+/// flag for BPF_MAP_CREATE command. Flags for accessing BPF object from program
+/// side.
+pub const BPF_F_WRONLY_PROG = 0x100;
+/// flag for BPF_MAP_CREATE command. Clone map from listener for newly accepted
+/// socket
+pub const BPF_F_CLONE = 0x200;
+/// flag for BPF_MAP_CREATE command. Enable memory-mapping BPF map
+pub const BPF_F_MMAPABLE = 0x400;
+
+/// a single BPF instruction (opcode, two 4-bit registers, offset, immediate)
+pub const Insn = packed struct {
+    code: u8,
+    dst: u4,
+    src: u4,
+    off: i16,
+    imm: i32,
+
+    /// r0 - r9 are general purpose 64-bit registers, r10 points to the stack
+    /// frame
+    pub const Reg = enum(u4) {
+        r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+    };
+
+    const alu = 0x04;
+    const jmp = 0x05;
+    const mov = 0xb0;
+    const k = 0;
+    const exit_code = 0x90;
+
+    // TODO: implement more factory functions for the other instructions
+    /// load immediate value into a register (opcode is alu | mov | k = 0xb4)
+    pub fn load_imm(dst: Reg, imm: i32) Insn {
+        return Insn{
+            .code = alu | mov | k,
+            .dst = @enumToInt(dst),
+            .src = 0,
+            .off = 0,
+            .imm = imm,
+        };
+    }
+
+    /// exit BPF program (opcode is jmp | exit_code = 0x95, all operands zero)
+    pub fn exit() Insn {
+        return Insn{
+            .code = jmp | exit_code,
+            .dst = 0,
+            .src = 0,
+            .off = 0,
+            .imm = 0,
+        };
+    }
+};
+
+pub const Cmd = extern enum(usize) {
+    map_create,
+    map_lookup_elem,
+    map_update_elem,
+    map_delete_elem,
+    map_get_next_key,
+    prog_load,
+    obj_pin,
+    obj_get,
+    prog_attach,
+    prog_detach,
+    prog_test_run,
+    prog_get_next_id,
+    map_get_next_id,
+    prog_get_fd_by_id,
+    map_get_fd_by_id,
+    obj_get_info_by_fd,
+    prog_query,
+    raw_tracepoint_open,
+    btf_load,
+    btf_get_fd_by_id,
+    task_fd_query,
+    map_lookup_and_delete_elem,
+    map_freeze,
+    btf_get_next_id,
+    map_lookup_batch,
+    map_lookup_and_delete_batch,
+    map_update_batch,
+    map_delete_batch,
+    link_create,
+    link_update,
+    link_get_fd_by_id,
+    link_get_next_id,
+    enable_stats,
+    iter_create,
+    link_detach,
+    _,
+};
+
+pub const MapType = extern enum(u32) {
+    unspec,
+    hash,
+    array,
+    prog_array,
+    perf_event_array,
+    percpu_hash,
+    percpu_array,
+    stack_trace,
+    cgroup_array,
+    lru_hash,
+    lru_percpu_hash,
+    lpm_trie,
+    array_of_maps,
+    hash_of_maps,
+    devmap,
+    sockmap,
+    cpumap,
+    xskmap,
+    sockhash,
+    cgroup_storage,
+    reuseport_sockarray,
+    percpu_cgroup_storage,
+    queue,
+    stack,
+    sk_storage,
+    devmap_hash,
+    struct_ops,
+    ringbuf,
+    _,
+};
+
+pub const ProgType = extern enum(u32) {
+    unspec,
+    socket_filter,
+    kprobe,
+    sched_cls,
+    sched_act,
+    tracepoint,
+    xdp,
+    perf_event,
+    cgroup_skb,
+    cgroup_sock,
+    lwt_in,
+    lwt_out,
+    lwt_xmit,
+    sock_ops,
+    sk_skb,
+    cgroup_device,
+    sk_msg,
+    raw_tracepoint,
+    cgroup_sock_addr,
+    lwt_seg6local,
+    lirc_mode2,
+    sk_reuseport,
+    flow_dissector,
+    cgroup_sysctl,
+    raw_tracepoint_writable,
+    cgroup_sockopt,
+    tracing,
+    struct_ops,
+    ext,
+    lsm,
+    sk_lookup,
+    _, // non-exhaustive, matching Cmd, MapType and AttachType
+};
+
+pub const AttachType = extern enum(u32) {
+    cgroup_inet_ingress,
+    cgroup_inet_egress,
+    cgroup_inet_sock_create,
+    cgroup_sock_ops,
+    sk_skb_stream_parser,
+    sk_skb_stream_verdict,
+    cgroup_device,
+    sk_msg_verdict,
+    cgroup_inet4_bind,
+    cgroup_inet6_bind,
+    cgroup_inet4_connect,
+    cgroup_inet6_connect,
+    cgroup_inet4_post_bind,
+    cgroup_inet6_post_bind,
+    cgroup_udp4_sendmsg,
+    cgroup_udp6_sendmsg,
+    lirc_mode2,
+    flow_dissector,
+    cgroup_sysctl,
+    cgroup_udp4_recvmsg,
+    cgroup_udp6_recvmsg,
+    cgroup_getsockopt,
+    cgroup_setsockopt,
+    trace_raw_tp,
+    trace_fentry,
+    trace_fexit,
+    modify_return,
+    lsm_mac,
+    trace_iter,
+    cgroup_inet4_getpeername,
+    cgroup_inet6_getpeername,
+    cgroup_inet4_getsockname,
+    cgroup_inet6_getsockname,
+    xdp_devmap,
+    cgroup_inet_sock_release,
+    xdp_cpumap,
+    sk_lookup,
+    xdp,
+    _,
+};
+
+const obj_name_len = 16;
+/// struct used by Cmd.map_create command
+pub const MapCreateAttr = extern struct {
+    /// one of MapType
+    map_type: u32,
+    /// size of key in bytes
+    key_size: u32,
+    /// size of value in bytes
+    value_size: u32,
+    /// max number of entries in a map
+    max_entries: u32,
+    /// map_create related flags (BPF_F_NO_PREALLOC and the constants after it)
+    map_flags: u32,
+    /// fd pointing to the inner map
+    inner_map_fd: fd_t,
+    /// numa node (effective only if BPF_F_NUMA_NODE is set in map_flags)
+    numa_node: u32,
+    map_name: [obj_name_len]u8,
+    /// ifindex of netdev to create on
+    map_ifindex: u32,
+    /// fd pointing to a BTF type data
+    btf_fd: fd_t,
+    /// BTF type_id of the key
+    btf_key_type_id: u32,
+    /// BTF type_id of the value (kernel field name: btf_value_type_id)
+    bpf_value_type_id: u32,
+    /// BTF type_id of a kernel struct stored as the map value
+    btf_vmlinux_value_type_id: u32,
+};
+
+/// struct used by Cmd.map_*_elem commands
+pub const MapElemAttr = extern struct {
+    map_fd: fd_t,
+    key: u64,
+    result: extern union {
+        value: u64,
+        next_key: u64,
+    },
+    flags: u64,
+};
+
+/// struct used by Cmd.map_*_batch commands
+pub const MapBatchAttr = extern struct {
+    /// start batch, NULL to start from beginning
+    in_batch: u64,
+    /// output: next start batch
+    out_batch: u64,
+    keys: u64,
+    values: u64,
+    /// input/output:
+    /// input: # of key/value elements
+    /// output: # of filled elements
+    count: u32,
+    map_fd: fd_t,
+    elem_flags: u64,
+    flags: u64,
+};
+
+/// struct used by Cmd.prog_load command
+pub const ProgLoadAttr = extern struct {
+    /// one of ProgType
+    prog_type: u32,
+    insn_cnt: u32,
+    insns: u64,
+    license: u64,
+    /// verbosity level of verifier
+    log_level: u32,
+    /// size of user buffer
+    log_size: u32,
+    /// user supplied buffer
+    log_buf: u64,
+    /// not used
+    kern_version: u32,
+    prog_flags: u32,
+    prog_name: [obj_name_len]u8,
+    /// ifindex of netdev to prep for
+    prog_ifindex: u32,
+    /// must be known at load time for some prog types, to verify attach type
+    /// specific parts of prog (context accesses, allowed helpers, etc).
+    expected_attach_type: u32,
+    /// fd pointing to BTF type data
+    prog_btf_fd: fd_t,
+    /// userspace bpf_func_info size
+    func_info_rec_size: u32,
+    func_info: u64,
+    /// number of bpf_func_info records
+    func_info_cnt: u32,
+    /// userspace bpf_line_info size
+    line_info_rec_size: u32,
+    line_info: u64,
+    /// number of bpf_line_info records
+    line_info_cnt: u32,
+    /// in-kernel BTF type id to attach to (kernel field name: attach_btf_id)
+    attact_btf_id: u32,
+    /// 0 to attach to vmlinux (kernel field name: attach_prog_fd)
+    attach_prog_id: u32,
+};
+
+/// struct used by Cmd.obj_* commands
+pub const ObjAttr = extern struct {
+    pathname: u64,
+    bpf_fd: fd_t,
+    file_flags: u32,
+};
+
+/// struct used by Cmd.prog_attach/detach commands
+pub const ProgAttachAttr = extern struct {
+    /// container object to attach to
+    target_fd: fd_t,
+    /// eBPF program to attach
+    attach_bpf_fd: fd_t,
+    attach_type: u32,
+    attach_flags: u32,
+    // TODO: BPF_F_REPLACE flags
+    /// previously attached eBPF program to replace if F_REPLACE is used
+    replace_bpf_fd: fd_t,
+};
+
+/// struct used by Cmd.prog_test_run command
+pub const TestAttr = extern struct {
+    prog_fd: fd_t,
+    retval: u32,
+    /// input: len of data_in
+    data_size_in: u32,
+    /// input/output: len of data_out. returns ENOSPC if data_out is too small.
+    data_size_out: u32,
+    data_in: u64,
+    data_out: u64,
+    repeat: u32,
+    duration: u32,
+    /// input: len of ctx_in
+    ctx_size_in: u32,
+    /// input/output: len of ctx_out. returns ENOSPC if ctx_out is too small.
+    ctx_size_out: u32,
+    ctx_in: u64,
+    ctx_out: u64,
+};
+
+/// struct used by Cmd.*_get_*_id commands
+pub const GetIdAttr = extern struct {
+    id: extern union {
+        start_id: u32,
+        prog_id: u32,
+        map_id: u32,
+        btf_id: u32,
+        link_id: u32,
+    },
+    next_id: u32,
+    open_flags: u32,
+};
+
+/// struct used by Cmd.obj_get_info_by_fd command
+pub const InfoAttr = extern struct {
+    bpf_fd: fd_t,
+    info_len: u32,
+    info: u64,
+};
+
+/// struct used by Cmd.prog_query command
+pub const QueryAttr = extern struct {
+    /// container object to query
+    target_fd: fd_t,
+    attach_type: u32,
+    query_flags: u32,
+    attach_flags: u32,
+    prog_ids: u64,
+    prog_cnt: u32,
+};
+
+/// struct used by Cmd.raw_tracepoint_open command
+pub const RawTracepointAttr = extern struct {
+    name: u64,
+    prog_fd: fd_t,
+};
+
+/// struct used by Cmd.btf_load command
+pub const BtfLoadAttr = extern struct {
+    btf: u64,
+    btf_log_buf: u64,
+    btf_size: u32,
+    btf_log_size: u32,
+    btf_log_level: u32,
+};
+
+/// struct used by Cmd.task_fd_query command
+pub const TaskFdQueryAttr = extern struct {
+    /// input: pid
+    pid: pid_t,
+    /// input: fd
+    fd: fd_t,
+    /// input: flags
+    flags: u32,
+    /// input/output: buf len
+    buf_len: u32,
+    /// input/output:
+    /// tp_name for tracepoint
+    /// symbol for kprobe
+    /// filename for uprobe
+    buf: u64,
+    /// output: prog_id
+    prog_id: u32,
+    /// output: BPF_FD_TYPE
+    fd_type: u32,
+    /// output: probe_offset
+    probe_offset: u64,
+    /// output: probe_addr
+    probe_addr: u64,
+};
+
+/// struct used by Cmd.link_create command
+pub const LinkCreateAttr = extern struct {
+    /// eBPF program to attach
+    prog_fd: fd_t,
+    /// object to attach to
+    target_fd: fd_t,
+    attach_type: u32,
+    /// extra flags
+    flags: u32,
+};
+
+/// struct used by Cmd.link_update command
+pub const LinkUpdateAttr = extern struct {
+    link_fd: fd_t,
+    /// new program to update link with
+    new_prog_fd: fd_t,
+    /// extra flags
+    flags: u32,
+    /// expected link's program fd, it is specified only if F_REPLACE is
+    /// set in flags
+    old_prog_fd: fd_t,
+};
+
+/// struct used by Cmd.enable_stats command
+pub const EnableStatsAttr = extern struct {
+    type: u32,
+};
+
+/// struct used by Cmd.iter_create command
+pub const IterCreateAttr = extern struct {
+    link_fd: fd_t,
+    flags: u32,
+};
+
+pub const Attr = extern union { // overlay of the per-command structs; `bpf` takes a pointer to it
+    map_create: MapCreateAttr,
+    map_elem: MapElemAttr,
+    map_batch: MapBatchAttr,
+    prog_load: ProgLoadAttr,
+    obj: ObjAttr,
+    prog_attach: ProgAttachAttr,
+    test_run: TestAttr, // the Cmd.prog_test_run struct is declared under the name TestAttr
+    get_id: GetIdAttr,
+    info: InfoAttr,
+    query: QueryAttr,
+    raw_tracepoint: RawTracepointAttr,
+    btf_load: BtfLoadAttr,
+    task_fd_query: TaskFdQueryAttr,
+    link_create: LinkCreateAttr,
+    link_update: LinkUpdateAttr,
+    enable_stats: EnableStatsAttr,
+    iter_create: IterCreateAttr,
+};
+
+pub fn bpf(cmd: Cmd, attr: *Attr, size: u32) usize { // returns the raw syscall3 result unchanged
+    return std.os.linux.syscall3(.bpf, @enumToInt(cmd), @ptrToInt(attr), size); // qualified so it does not depend on what `usingnamespace std.os` exposes
+}