diff options
| author | Toke Høiland-Jørgensen <toke@redhat.com> | 2021-07-09 17:51:13 +0200 |
|---|---|---|
| committer | Toke Høiland-Jørgensen <toke@redhat.com> | 2021-11-09 21:33:54 +0100 |
| commit | 64155ff31b7e15635ccae5395cf57caa0834faba (patch) | |
| tree | fd6b5dee1842973c95ebf66dab03913a28ef47e4 | |
| parent | 181bc9e4a64cc89cae2fe5b0091ed328383ad40c (diff) | |
| download | linux-64155ff31b7e15635ccae5395cf57caa0834faba.tar.gz | |
xdp: add dequeue program type for getting packets from a PIFO
Add a new BPF_PROG_TYPE_DEQUEUE, which will be executed by the device
drivers when they can transmit packets (e.g., on xmit complete).
The idea is that the drivers will do something equivalent to:
max_pkts = MIN(dev->free_txring_space, BULK_SIZE);
for (i = 0; i < max_pkts; i++) {
struct dequeue_ctx ctx = {
.egress_ifindex = dev->ifindex,
.txring_free_items = dev->free_txring_space
};
ret = BPF_PROG_RUN(dev->dequeue_prog);
if (ret || !ctx.dequeued_frame)
break;
push_to_txring(ctx.dequeued_frame);
}
ring_tx_doorbell(dev);
This way, the dequeue program can make a decision on which PIFO queue to
pull the packet from (by just keeping references to several PIFO maps,
using direct program references, or using a to-be-defined map-in-map
structure), pick one based on whatever criteria it wants, and return a
frame to the driver. And the driver will do bulking by repeatedly calling
the BPF program, similar to what XDP does on RX today.
This implies that if the BPF dequeue program needs to do expensive lookups
to decide on which queue to dequeue from, it will have to implement its own
caching or other mitigation mechanism (we don't do that on its behalf)
between individual runs.
To actually return the packet, the dequeue program uses a helper which will
put a reference to the packet into the context object, so the caller can
read it from there. This allows us to leverage the existing verifier
reference counting support, so that a packet that is dequeued from a PIFO
map must be either dropped or returned before the dequeue program exits.
For now, the dequeue program can only be run using the bpf_prog_run()
syscall; actually hooking it into the TX path is still TBD. Another
outstanding issue is allowing the dequeue program to access the packet
contents, but this should be doable with just a bit more surgery to the
verifier.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
| -rw-r--r-- | include/linux/bpf.h | 2 | ||||
| -rw-r--r-- | include/linux/bpf_types.h | 2 | ||||
| -rw-r--r-- | include/net/xdp.h | 6 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 6 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 1 | ||||
| -rw-r--r-- | net/bpf/test_run.c | 38 | ||||
| -rw-r--r-- | net/core/filter.c | 54 | ||||
| -rw-r--r-- | tools/include/uapi/linux/bpf.h | 6 |
8 files changed, 115 insertions, 0 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fc765cee94839..9dd3d6ecdbb88 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1654,6 +1654,8 @@ int array_map_alloc_check(union bpf_attr *attr); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_dequeue(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_tracing(struct bpf_prog *prog, diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 00ba2df0e6932..57107b5bd0189 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -10,6 +10,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act, struct __sk_buff, struct sk_buff) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_DEQUEUE, dequeue, + struct dequeue_ctx, struct dequeue_data) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb, struct __sk_buff, struct sk_buff) diff --git a/include/net/xdp.h b/include/net/xdp.h index b15128dae5ee3..8fae364e92950 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,12 @@ struct xdp_buff { u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +struct dequeue_data { + struct xdp_txq_info *txq; + struct xdp_frame *dequeued_pkt; +}; + + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5099e4dca048a..bf1bc7644f0a1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -951,6 +951,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_DEQUEUE, }; enum bpf_attach_type { @@ -5537,6 +5538,11 @@ 
struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +struct dequeue_ctx { + __u32 egress_ifindex; + __u32 txring_free_items; +}; + /* DEVMAP map-value layout * * The struct data-layout of map-value is a configuration interface. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a2..515ceb39d82a4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2124,6 +2124,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_DEQUEUE: case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 46dd957559672..5e806c0276cbb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -836,6 +836,44 @@ free_ctx: return ret; } +int bpf_prog_test_run_dequeue(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct xdp_txq_info txq = { .dev = current->nsproxy->net_ns->loopback_dev }; + struct dequeue_data ctx = { .txq = &txq }; + u32 repeat = kattr->test.repeat; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->expected_attach_type) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || + kattr->test.ctx_in || kattr->test.ctx_out || repeat > 1) + return -EINVAL; + + ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, true); + if (ret) + goto out; + + if (ctx.dequeued_pkt) { + struct xdp_buff xdp = {}; + u32 size; + + xdp_convert_frame_to_buff(ctx.dequeued_pkt, &xdp); + size = xdp.data_end - xdp.data_meta; + + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, + retval, duration); + xdp_return_frame(ctx.dequeued_pkt); + } else { + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + } +out: + return ret; +} + + static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) { /* make sure the fields we don't use are zeroed */ diff --git a/net/core/filter.c b/net/core/filter.c index c1beade4cae8a..849ac9d593278 
100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7501,6 +7501,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +dequeue_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; @@ -8196,6 +8202,20 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool dequeue_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + switch (off) { + case offsetof(struct dequeue_ctx, egress_ifindex): + return true; + } + return false; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -9172,6 +9192,29 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 dequeue_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct dequeue_ctx, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct dequeue_data, txq), + si->dst_reg, si->src_reg, + offsetof(struct dequeue_data, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + } + return insn - insn_buf; +} + + /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. 
@@ -9979,6 +10022,17 @@ const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; +const struct bpf_verifier_ops dequeue_verifier_ops = { + .get_func_proto = dequeue_func_proto, + .is_valid_access = dequeue_is_valid_access, + .convert_ctx_access = dequeue_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, +}; + +const struct bpf_prog_ops dequeue_prog_ops = { + .test_run = bpf_prog_test_run_dequeue, +}; + const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = cg_skb_is_valid_access, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5099e4dca048a..bf1bc7644f0a1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -951,6 +951,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_DEQUEUE, }; enum bpf_attach_type { @@ -5537,6 +5538,11 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +struct dequeue_ctx { + __u32 egress_ifindex; + __u32 txring_free_items; +}; + /* DEVMAP map-value layout * * The struct data-layout of map-value is a configuration interface. |
