aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorToke Høiland-Jørgensen <toke@redhat.com>2021-06-30 16:36:10 +0200
committerToke Høiland-Jørgensen <toke@redhat.com>2021-11-09 21:33:54 +0100
commit181bc9e4a64cc89cae2fe5b0091ed328383ad40c (patch)
treed279598446ee7edbda26b68342a490632291c9bb
parentc23551c9c36ae394f9c53a5adf1944a943c65e0b (diff)
downloadlinux-181bc9e4a64cc89cae2fe5b0091ed328383ad40c.tar.gz
Add a PIFO map type for queueing packets
This adds a new PIFO map type for queueing packets for XDP. The map can be used as a target for XDP_REDIRECT, where the index is used as the PIFO rank (although for now, we just ignore the rank and queue everything in a static array). This way, an XDP program can implement queueing policies by redirecting packets into a PIFO where they will be queued until they are pulled out by the new helper defined in subsequent commits. More complicated policies can be implemented by having several PIFOs and picking the one to enqueue to by whichever mechanism the XDP program chooses. In particular, by implementing another PIFO map-in-map type we can enable hierarchical policies like those described by the PIFO and Eiffel papers. Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
-rw-r--r--include/linux/bpf.h3
-rw-r--r--include/linux/bpf_types.h1
-rw-r--r--include/net/xdp.h5
-rw-r--r--include/uapi/linux/bpf.h1
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/pifomap.c360
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--net/core/filter.c7
-rw-r--r--tools/include/uapi/linux/bpf.h1
9 files changed, 383 insertions, 1 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index df3410bff4b06..fc765cee94839 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1639,6 +1639,9 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
struct sk_buff *skb);
+int pifo_map_enqueue(struct bpf_map *map, struct xdp_buff *xdp, u32 index);
+struct xdp_frame *pifo_map_dequeue(struct bpf_map *map, u64 flags);
+
/* Return map's numa specified by userspace */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
{
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 48a91c51c015d..00ba2df0e6932 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -110,6 +110,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PIFO, pifo_map_ops)
#if defined(CONFIG_XDP_SOCKETS)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 447f9b1578f38..b15128dae5ee3 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -121,7 +121,10 @@ struct xdp_frame {
* while mem info is valid on remote CPU.
*/
struct xdp_mem_info mem;
- struct net_device *dev_rx; /* used by cpumap */
+ union {
+ struct net_device *dev_rx; /* used by cpumap */
+ struct xdp_frame *next; /* used by pifomap */
+ };
};
#define XDP_BULK_QUEUE_SIZE 16
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 509eee5f0393d..5099e4dca048a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -907,6 +907,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
BPF_MAP_TYPE_BLOOM_FILTER,
+ BPF_MAP_TYPE_PIFO,
};
/* Note that tracing related programs such as
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index cf6ca339f3cd8..a42a25c58f921 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += pifomap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
diff --git a/kernel/bpf/pifomap.c b/kernel/bpf/pifomap.c
new file mode 100644
index 0000000000000..0bdc2a89a47fd
--- /dev/null
+++ b/kernel/bpf/pifomap.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* Pifomaps queue packets
+ */
+#include <linux/bpf.h>
+#include <linux/bitops.h>
+#include <net/xdp.h>
+#include <linux/filter.h>
+#include <trace/events/xdp.h>
+
+#define PIFO_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* A bucket holds a FIFO list of xdp_frames, chained through the frames'
+ * next pointer (union with dev_rx in struct xdp_frame).
+ */
+struct bpf_pifo_bucket {
+	struct xdp_frame *head, *tail;
+	u32 frame_count;	/* number of frames currently in this bucket */
+};
+
+struct bpf_pifo_queue {
+	struct bpf_pifo_bucket *buckets;	/* one bucket per rank in [min_rank, min_rank+range) */
+	unsigned long *bitmap;		/* flat backing storage for all bitmap levels */
+	unsigned long **lvl_bitmap;	/* per-level pointers into bitmap; last level has one bit per bucket */
+	u32 min_rank;			/* rank corresponding to bucket 0 */
+	u32 levels;			/* number of bitmap levels in the hierarchy */
+	u32 range;			/* number of buckets */
+};
+
+struct bpf_pifo_map {
+	struct bpf_map map;
+	struct bpf_pifo_queue *queue;
+	long num_queued;	/* total frames queued across all buckets */
+};
+
+/* Free a queue and its backing allocations. Presumably safe on a partially
+ * initialised queue (bpf_map_area_free() is expected to accept NULL, like
+ * kvfree) — NOTE(review): confirm. Does NOT return queued frames; callers
+ * must flush the queue first.
+ */
+static void pifo_queue_free(struct bpf_pifo_queue *q)
+{
+	bpf_map_area_free(q->buckets);
+	bpf_map_area_free(q->bitmap);
+	bpf_map_area_free(q->lvl_bitmap);
+	kfree(q);
+}
+
+/* Allocate a queue with @range buckets plus the hierarchical bitmap used to
+ * find occupied buckets. Each bitmap level fans out by BITS_PER_TYPE(long),
+ * the leaf level having one bit per bucket. Returns NULL on failure.
+ */
+static struct bpf_pifo_queue *pifo_queue_alloc(u32 range, int numa_node)
+{
+	u32 num_longs = 0, offset = 0, i, lvl, levels;
+	struct bpf_pifo_queue *q;
+
+	levels = __KERNEL_DIV_ROUND_UP(ilog2(range), ilog2(BITS_PER_TYPE(long)));
+	for (i = 0, lvl = 1; i < levels; i++) {
+		num_longs += lvl;
+		lvl *= BITS_PER_TYPE(long);
+	}
+
+	q = kzalloc(sizeof(struct bpf_pifo_queue), GFP_USER | __GFP_ACCOUNT);
+	if (!q)
+		return NULL;
+	q->buckets = bpf_map_area_alloc(sizeof(struct bpf_pifo_bucket) * range,
+					numa_node);
+	if (!q->buckets)
+		goto err;
+
+	q->bitmap = bpf_map_area_alloc(sizeof(unsigned long) * num_longs,
+				       numa_node);
+	if (!q->bitmap)
+		goto err;
+
+	q->lvl_bitmap = bpf_map_area_alloc(sizeof(unsigned long *) * levels,
+					   numa_node);
+	/* this allocation was previously unchecked, leading to a NULL
+	 * dereference below if it failed
+	 */
+	if (!q->lvl_bitmap)
+		goto err;
+
+	/* set up the per-level pointers into the flat bitmap allocation */
+	for (i = 0, lvl = 1; i < levels; i++) {
+		q->lvl_bitmap[i] = &q->bitmap[offset];
+		offset += lvl;
+		lvl *= BITS_PER_TYPE(long);
+	}
+	q->levels = levels;
+	q->range = range;
+	return q;
+
+err:
+	pifo_queue_free(q);
+	return NULL;
+}
+
+
+/* Validate userspace-supplied attributes and initialise the map.
+ * Returns 0 on success, -EINVAL on bad attributes, -ENOMEM on allocation
+ * failure.
+ */
+static int pifo_map_init_map(struct bpf_pifo_map *pifo, union bpf_attr *attr)
+{
+	u32 range = attr->max_entries;
+
+	/* check sanity of attributes. keys and values are u32; max_entries
+	 * is the number of buckets, which must be a power of two >= 8
+	 * (range < 8 also rejects range == 0)
+	 */
+	if (attr->value_size != 4 || attr->key_size != 4 ||
+	    range < 8 || !is_power_of_2(range) ||
+	    attr->map_flags & ~PIFO_CREATE_FLAG_MASK)
+		return -EINVAL;
+
+	/* PIFO map is special, we don't want BPF writing straight to it;
+	 * enqueue/dequeue go through dedicated entry points instead
+	 */
+	attr->map_flags |= BPF_F_RDONLY_PROG;
+	bpf_map_init_from_attr(&pifo->map, attr);
+
+	pifo->queue = pifo_queue_alloc(range, pifo->map.numa_node);
+	if (!pifo->queue)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* bpf_map_ops entry point: allocate and initialise a PIFO map from the
+ * attributes supplied by the BPF_MAP_CREATE syscall. Requires CAP_NET_ADMIN.
+ */
+static struct bpf_map *pifo_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_pifo_map *pifo;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	pifo = kzalloc(sizeof(*pifo), GFP_USER | __GFP_ACCOUNT);
+	if (!pifo)
+		return ERR_PTR(-ENOMEM);
+
+	err = pifo_map_init_map(pifo, attr);
+	if (!err)
+		return &pifo->map;
+
+	kfree(pifo);
+	return ERR_PTR(err);
+}
+
+/* Return all queued frames to their memory pools. Called on map teardown.
+ * Only visits buckets whose leaf bit is set: the previous version started
+ * unconditionally at bucket 0, which could walk a stale head pointer left
+ * behind in an emptied bucket.
+ */
+static void pifo_queue_flush(struct bpf_pifo_queue *queue)
+{
+	unsigned long *bitmap = queue->lvl_bitmap[queue->levels - 1];
+	unsigned long i;
+
+	for (i = find_first_bit(bitmap, queue->range);
+	     i < queue->range;
+	     i = find_next_bit(bitmap, queue->range, i + 1)) {
+		struct bpf_pifo_bucket *bucket = &queue->buckets[i];
+		struct xdp_frame *frame = bucket->head, *next;
+
+		while (frame) {
+			next = frame->next;
+			xdp_return_frame(frame);
+			frame = next;
+		}
+	}
+}
+
+static void pifo_map_free(struct bpf_map *map)
+{
+	struct bpf_pifo_map *pifo = container_of(map, struct bpf_pifo_map, map);
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. The following synchronize_rcu() guarantees
+	 * both rcu read critical sections complete and waits for
+	 * preempt-disable regions (NAPI being the relevant context here) so we
+	 * are certain there will be no further enqueues against this map and
+	 * it is safe to tear the queue down.
+	 */
+
+	synchronize_rcu();
+
+	/* Make sure outstanding RCU callbacks have completed. NOTE(review):
+	 * this (and the comment above) was adapted from devmap; confirm the
+	 * rcu_barrier() is still needed here since no call_rcu() users are
+	 * visible in this file.
+	 */
+	rcu_barrier();
+
+	pifo_queue_flush(pifo->queue);
+	pifo_queue_free(pifo->queue);
+	kfree(pifo);
+}
+
+/* Iterate over occupied buckets from userspace. Keys are ranks; a NULL key
+ * (or one below the current window) starts iteration at the first occupied
+ * bucket. Returns -ENOENT when there are no further occupied buckets.
+ */
+static int pifo_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_pifo_map *pifo = container_of(map, struct bpf_pifo_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX, offset;
+	struct bpf_pifo_queue *queue = pifo->queue;
+	u32 *next = next_key;
+	unsigned long idx;
+
+	if (index == U32_MAX || index < queue->min_rank)
+		offset = queue->min_rank;
+	else
+		offset = index - queue->min_rank + 1;
+
+	if (offset >= queue->range)
+		return -ENOENT;
+
+	idx = find_next_bit(queue->lvl_bitmap[queue->levels - 1],
+			    queue->range, offset);
+	if (idx == queue->range)
+		return -ENOENT;
+
+	/* translate the bucket index back to a rank so the returned key
+	 * round-trips through pifo_map_lookup_elem() when min_rank != 0
+	 */
+	*next = idx + queue->min_rank;
+	return 0;
+}
+
+/* Mark bucket @rank occupied: set the leaf bit, then propagate upwards.
+ * Each parent bit summarises one word of the level below. File-local, so
+ * declared static (was missing, polluting the global namespace).
+ */
+static void pifo_set_bit(struct bpf_pifo_queue *queue, u32 rank)
+{
+	u32 i;
+
+	for (i = queue->levels; i > 0; i--) {
+		unsigned long *bitmap = queue->lvl_bitmap[i - 1];
+
+		set_bit(rank, bitmap);
+		rank /= BITS_PER_TYPE(long);
+	}
+}
+
+/* Mark bucket @rank empty: clear the leaf bit and propagate upwards, but
+ * stop as soon as the word containing the cleared bit is still non-zero,
+ * since then the summary bit in the level above must stay set.
+ * File-local, so declared static (was missing).
+ */
+static void pifo_clear_bit(struct bpf_pifo_queue *queue, u32 rank)
+{
+	u32 i;
+
+	for (i = queue->levels; i > 0; i--) {
+		unsigned long *bitmap = queue->lvl_bitmap[i - 1];
+
+		clear_bit(rank, bitmap);
+		rank /= BITS_PER_TYPE(long);
+
+		/* another bit is set in this word, don't clear bit in the
+		 * higher level
+		 */
+		if (*(bitmap + rank))
+			break;
+	}
+}
+
+/* Enqueue an XDP packet into the bucket selected by @index (the rank).
+ * Called from xdp_do_redirect() in NAPI context. Returns 0 on success,
+ * -EOVERFLOW when the map is full, -ENOMEM if frame conversion fails.
+ */
+int pifo_map_enqueue(struct bpf_map *map, struct xdp_buff *xdp, u32 index)
+{
+	struct bpf_pifo_map *pifo = container_of(map, struct bpf_pifo_map, map);
+	struct bpf_pifo_queue *queue = pifo->queue;
+	struct bpf_pifo_bucket *bucket;
+	struct xdp_frame *frame;
+	u32 bucket_idx;
+
+	if (unlikely(pifo->num_queued >= pifo->map.max_entries))
+		return -EOVERFLOW;
+
+	frame = xdp_convert_buff_to_frame(xdp);
+	if (unlikely(!frame))
+		return -ENOMEM;
+	frame->next = NULL;
+
+	/* clamp the rank into the window of buckets currently covered */
+	bucket_idx = index - min(queue->min_rank, index);
+	if (unlikely(bucket_idx >= queue->range))
+		bucket_idx = queue->range - 1;
+
+	bucket = &queue->buckets[bucket_idx];
+	if (likely(!bucket->tail)) {
+		/* empty bucket: frame becomes both head and tail */
+		bucket->head = frame;
+		bucket->tail = frame;
+		pifo_set_bit(queue, bucket_idx);
+	} else {
+		bucket->tail->next = frame;
+		bucket->tail = frame;
+	}
+
+	bucket->frame_count++;
+	pifo->num_queued++;
+	return 0;
+}
+
+/* Walk the bitmap hierarchy top-down and return the index of the first
+ * (lowest-ranked) occupied bucket, or -1 (ULONG_MAX) if the queue is empty.
+ * The word index at each level is the global bit index found at the level
+ * above; the previous version used only the last __ffs() result, which
+ * selected the wrong word once the hierarchy had three or more levels
+ * (range > BITS_PER_TYPE(long)^2).
+ */
+static unsigned long pifo_find_first_bucket(struct bpf_pifo_queue *queue)
+{
+	unsigned long *word, idx = 0;
+	u32 i;
+
+	for (i = 0; i < queue->levels; i++) {
+		word = queue->lvl_bitmap[i] + idx;
+		if (!*word)
+			return -1;
+		idx = idx * BITS_PER_TYPE(long) + __ffs(*word);
+	}
+	return idx;
+}
+
+/* Dequeue the frame at the head of the lowest-ranked occupied bucket.
+ * Returns NULL when the queue is empty or @flags is non-zero.
+ */
+struct xdp_frame *pifo_map_dequeue(struct bpf_map *map, u64 flags)
+{
+	struct bpf_pifo_map *pifo = container_of(map, struct bpf_pifo_map, map);
+	struct bpf_pifo_queue *queue = pifo->queue;
+	struct bpf_pifo_bucket *bucket;
+	unsigned long bucket_idx;
+	struct xdp_frame *xdpf;
+
+	/* FIXME: How to return an error different from NULL here? */
+	if (flags)
+		return NULL;
+
+	bucket_idx = pifo_find_first_bucket(queue);
+	if (bucket_idx == -1)
+		return NULL;
+	bucket = &queue->buckets[bucket_idx];
+
+	if (WARN_ON_ONCE(!bucket->tail))
+		return NULL;
+
+	xdpf = bucket->head;
+	bucket->head = xdpf->next;
+	if (!bucket->head) {
+		/* bucket now empty; also reset head (the old code left it
+		 * dangling at the just-dequeued frame)
+		 */
+		bucket->tail = NULL;
+		pifo_clear_bit(queue, bucket_idx);
+	}
+	/* don't leak the queue-internal next pointer to the caller */
+	xdpf->next = NULL;
+	pifo->num_queued--;
+	bucket->frame_count--;
+	return xdpf;
+}
+
+/* Look up a bucket by rank. The exposed value is the bucket's frame count,
+ * i.e. the number of packets currently queued at that rank; NULL is
+ * returned for ranks outside the covered window.
+ */
+static void *pifo_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_pifo_map *pifo = container_of(map, struct bpf_pifo_map, map);
+	struct bpf_pifo_queue *queue = pifo->queue;
+	u32 rank = *(u32 *)key, bucket_idx;
+
+	if (rank < queue->min_rank)
+		return NULL;
+	bucket_idx = rank - queue->min_rank;
+	if (bucket_idx >= queue->range)
+		return NULL;
+
+	return &queue->buckets[bucket_idx].frame_count;
+}
+
+/* Elements can only be added via the enqueue (redirect) path, never through
+ * the regular update syscall.
+ */
+static int pifo_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	return -EINVAL;
+}
+
+/* Called via the bpf_redirect_map() helper: record the target map and the
+ * rank (@index) in the per-CPU redirect state; the frame itself is enqueued
+ * later from xdp_do_redirect().
+ */
+static int pifo_map_redirect(struct bpf_map *map, u32 index, u64 flags)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
+
+	/* Lower bits of the flags are used as return code on lookup failure */
+	if (unlikely(flags & ~action_mask))
+		return XDP_ABORTED;
+
+	/* no per-entry value to stash; the rank in tgt_index is enough */
+	ri->tgt_value = NULL;
+	ri->tgt_index = index;
+	ri->map_id = map->id;
+	ri->map_type = map->map_type;
+	ri->flags = flags;
+	WRITE_ONCE(ri->map, map);
+
+	return XDP_REDIRECT;
+}
+
+static int pifo_map_btf_id;
+/* Map operations; enqueue/dequeue have dedicated entry points
+ * (pifo_map_enqueue / pifo_map_dequeue) rather than going through the
+ * generic update/delete callbacks.
+ */
+const struct bpf_map_ops pifo_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc = pifo_map_alloc,
+	.map_free = pifo_map_free,
+	.map_get_next_key = pifo_map_get_next_key,
+	.map_lookup_elem = pifo_map_lookup_elem,
+	.map_update_elem = pifo_map_update_elem,
+	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_pifo_map",
+	.map_btf_id = &pifo_map_btf_id,
+	.map_redirect = pifo_map_redirect,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1aafb43f61d1c..14ec48cba52e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5585,6 +5585,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_PIFO:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
default:
break;
}
@@ -5626,6 +5630,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
map->map_type != BPF_MAP_TYPE_CPUMAP &&
+ map->map_type != BPF_MAP_TYPE_PIFO &&
map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e8d3b49c2976..c1beade4cae8a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3984,6 +3984,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
err = dev_map_enqueue(fwd, xdp, dev);
}
break;
+ case BPF_MAP_TYPE_PIFO:
+ map = READ_ONCE(ri->map);
+ if (unlikely(!map))
+ err = -EINVAL;
+ else
+ err = pifo_map_enqueue(map, xdp, ri->tgt_index);
+ break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_enqueue(fwd, xdp, dev);
break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 509eee5f0393d..5099e4dca048a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -907,6 +907,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
BPF_MAP_TYPE_BLOOM_FILTER,
+ BPF_MAP_TYPE_PIFO,
};
/* Note that tracing related programs such as