BACKPORT: xdp: Add devmap_hash map type for looking up devices by hashed index

A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually
needed in the map.

This patch adds a second type of device map where the key is looked up
using a hashmap, instead of being used as an array index. This allows maps
to be densely packed, so they can be smaller.

Change-Id: I6155de499a47fb45bac1a39319f0ad979032fd6d
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Toke Høiland-Jørgensen 6 years ago committed by Ruchit
parent b3f51f0024
commit bc7b2dee9b
  1. 7
      include/linux/bpf.h
  2. 1
      include/linux/bpf_types.h
  3. 1
      include/uapi/linux/bpf.h
  4. 208
      kernel/bpf/devmap.c
  5. 4
      kernel/bpf/verifier.c
  6. 6
      net/core/filter.c

@ -412,6 +412,7 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
/* Map specifics */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
@ -484,6 +485,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
return NULL;
}
/*
 * Stub variant of __dev_map_hash_lookup_elem(): ignores @map and @key and
 * always reports that no device was found.
 * NOTE(review): presumably the fallback used when devmap support is not
 * built in; the enclosing #ifdef is outside this hunk -- confirm.
 */
static inline struct net_device *
__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
	return NULL;
}
/*
 * Stub variant of __dev_map_insert_ctx(): takes the same arguments as the
 * real implementation but does nothing.
 */
static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
{
}

@ -40,6 +40,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
#ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
#ifdef CONFIG_STREAM_PARSER
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
#endif

@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_DEVMAP_HASH = 25,
};
enum bpf_prog_type {

@ -46,6 +46,12 @@
* notifier hook walks the map we know that new dev references can not be
* added by the user because core infrastructure ensures dev_get_by_index()
* calls will fail at this point.
*
* The devmap_hash type is a map type which interprets keys as ifindexes and
* indexes these using a hashmap. This allows maps that use ifindex as key to be
* densely packed instead of having holes in the lookup array for unused
* ifindexes. The setup and packet enqueue/send code is shared between the two
* types of devmap; only the lookup and insertion are different.
*/
#include <linux/bpf.h>
#include <linux/filter.h>
@ -55,6 +61,7 @@
struct bpf_dtab_netdev {
struct net_device *dev;
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
unsigned int bit;
struct rcu_head rcu;
@ -65,11 +72,30 @@ struct bpf_dtab {
struct bpf_dtab_netdev **netdev_map;
unsigned long __percpu *flush_needed;
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
struct hlist_head *dev_index_head;
spinlock_t index_lock;
unsigned int items;
u32 n_buckets;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
/*
 * Allocate an array of @entries hash-bucket heads and initialise every
 * bucket to an empty list.
 *
 * Returns the new array, or NULL if kmalloc_array() fails.
 */
static struct hlist_head *dev_map_create_hash(unsigned int entries)
{
	struct hlist_head *hash;
	unsigned int i;	/* same type as @entries: no signed/unsigned compare */

	hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
	if (!hash)
		return NULL;

	for (i = 0; i < entries; i++)
		INIT_HLIST_HEAD(&hash[i]);

	return hash;
}
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
@ -109,6 +135,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
if (!dtab->n_buckets) { /* Overflow check */
err = -EINVAL;
goto free_dtab;
}
cost += sizeof(struct hlist_head) * dtab->n_buckets;
}
/* if map size is larger than memlock limit, reject it early */
err = bpf_map_precharge_memlock(dtab->map.pages);
if (err)
@ -129,13 +165,24 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab->netdev_map)
goto free_dtab;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
if (!dtab->dev_index_head)
goto free_map_area;
spin_lock_init(&dtab->index_lock);
}
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
free_map_area:
bpf_map_area_free(dtab->netdev_map);
free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab->dev_index_head);
kfree(dtab);
return ERR_PTR(err);
}
@ -187,6 +234,7 @@ static void dev_map_free(struct bpf_map *map)
free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab->dev_index_head);
kfree(dtab);
}
@ -207,6 +255,77 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
int idx)
{
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
/*
 * Find the devmap_hash entry stored under @key.
 *
 * Walks the bucket selected by dev_map_index_hash() with the RCU list
 * iterator and compares each entry's ->bit field against @key.
 *
 * Returns the matching entry, or NULL when the bucket holds no such key.
 */
static struct bpf_dtab_netdev *
__dev_map_hash_lookup_elem_dtab(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct hlist_head *bucket = dev_map_index_hash(dtab, key);
	struct bpf_dtab_netdev *match = NULL;
	struct bpf_dtab_netdev *entry;

	hlist_for_each_entry_rcu(entry, bucket, index_hlist) {
		if (entry->bit == key) {
			match = entry;
			break;
		}
	}

	return match;
}
/*
 * Look up @key in a devmap_hash map and return the net_device stored in
 * the matching entry, or NULL when there is no entry for @key.
 */
struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab_netdev *entry;

	entry = __dev_map_hash_lookup_elem_dtab(map, key);
	if (!entry)
		return NULL;

	return entry->dev;
}
/*
 * Iteration helper for devmap_hash maps: store in @next_key the key that
 * follows @key.
 *
 * - If @key is NULL, or no entry exists for it, iteration (re)starts at
 *   the first non-empty bucket.
 * - If the entry for @key has a successor in its bucket chain, that
 *   successor's key is returned.
 * - Otherwise the buckets after the one @key hashes to are scanned and the
 *   head of the first non-empty one is returned.
 *
 * Returns 0 when a key was written to @next_key, -ENOENT when there is no
 * further entry.
 */
static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
				     void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *cur, *candidate;
	struct hlist_head *bucket_head;
	u32 *out = next_key;
	int bucket = 0;

	if (key) {
		u32 cur_key = *(u32 *)key;

		cur = __dev_map_hash_lookup_elem_dtab(map, cur_key);
		if (cur) {
			/* Prefer the next entry chained in the same bucket. */
			candidate = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&cur->index_hlist)),
						     struct bpf_dtab_netdev,
						     index_hlist);
			if (candidate) {
				*out = candidate->bit;
				return 0;
			}

			/* Chain exhausted: continue with the following bucket. */
			bucket = cur_key & (dtab->n_buckets - 1);
			bucket++;
		}
	}

	while (bucket < dtab->n_buckets) {
		bucket_head = dev_map_index_hash(dtab, bucket);

		candidate = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(bucket_head)),
					     struct bpf_dtab_netdev,
					     index_hlist);
		if (candidate) {
			*out = candidate->bit;
			return 0;
		}

		bucket++;
	}

	return -ENOENT;
}
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@ -268,6 +387,13 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
struct net_device *dev = __dev_map_hash_lookup_elem(map, *(u32 *)key);
return dev ? &dev->ifindex : NULL;
}
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_flush) {
@ -317,6 +443,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
/*
 * map_delete_elem callback for devmap_hash maps.
 *
 * Under index_lock (IRQs saved/disabled), looks up the entry for the u32
 * stored at @key; when found, decrements the item count, unlinks the entry
 * from its bucket and hands it to __dev_map_entry_free() via call_rcu().
 *
 * Returns 0 when an entry was removed, -ENOENT when the key was not found.
 */
static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *victim;
	unsigned long irq_state;
	int idx = *(u32 *)key;
	int rc;

	spin_lock_irqsave(&dtab->index_lock, irq_state);

	victim = __dev_map_hash_lookup_elem_dtab(map, idx);
	if (!victim) {
		rc = -ENOENT;
		goto unlock;
	}

	dtab->items--;
	hlist_del_init_rcu(&victim->index_hlist);
	call_rcu(&victim->rcu, __dev_map_entry_free);
	rc = 0;

unlock:
	spin_unlock_irqrestore(&dtab->index_lock, irq_state);

	return rc;
}
static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab *dtab,
u32 ifindex,
@ -376,6 +524,57 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
/*
 * Insert or replace the devmap_hash entry for the u32 at @key so that it
 * refers to the device whose ifindex is the u32 at @value.
 *
 * @net:       namespace passed through to __dev_map_alloc_node()
 * @map:       the devmap_hash map being updated
 * @key:       pointer to the u32 lookup key
 * @value:     pointer to the u32 ifindex; zero is rejected
 * @map_flags: BPF_ANY, BPF_NOEXIST or BPF_EXIST; larger values are rejected
 *
 * Returns 0 on success, -EINVAL for bad flags or a zero ifindex, -EEXIST
 * when BPF_NOEXIST was given and the key is present, -E2BIG when inserting
 * a new key would exceed max_entries, or the error encoded by
 * __dev_map_alloc_node().
 */
static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
				      void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 ifindex = *(u32 *)value;
	u32 idx = *(u32 *)key;
	unsigned long flags;
	int err;

	if (unlikely(map_flags > BPF_EXIST || !ifindex))
		return -EINVAL;

	/*
	 * Optimistic, unlocked check: only used to fail fast without
	 * allocating. The result is never dereferenced and is not trusted
	 * for the actual replacement below.
	 */
	if ((map_flags & BPF_NOEXIST) &&
	    __dev_map_hash_lookup_elem_dtab(map, idx))
		return -EEXIST;

	/*
	 * The new node is allocated before index_lock is taken, as in the
	 * original code.
	 * NOTE(review): whether __dev_map_alloc_node() may sleep is not
	 * visible here, so it is deliberately kept outside the spinlock.
	 */
	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	spin_lock_irqsave(&dtab->index_lock, flags);

	/*
	 * Look the old entry up only now, with the lock held. A pointer
	 * obtained before locking may already have been unlinked and queued
	 * for freeing by a concurrent delete/update of the same key, and
	 * unlinking/freeing it a second time would corrupt the list and the
	 * item count.
	 */
	old_dev = __dev_map_hash_lookup_elem_dtab(map, idx);
	if (old_dev && (map_flags & BPF_NOEXIST)) {
		err = -EEXIST;
		goto out_unlock_free;
	}

	if (old_dev) {
		hlist_del_rcu(&old_dev->index_hlist);
	} else {
		if (dtab->items >= dtab->map.max_entries) {
			err = -E2BIG;
			goto out_unlock_free;
		}
		dtab->items++;
	}

	hlist_add_head_rcu(&dev->index_hlist,
			   dev_map_index_hash(dtab, idx));
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;

out_unlock_free:
	spin_unlock_irqrestore(&dtab->index_lock, flags);
	/* Never published: release the new node the same way as before. */
	call_rcu(&dev->rcu, __dev_map_entry_free);
	return err;
}
/*
 * map_update_elem callback for devmap_hash maps: forwards to
 * __dev_map_hash_update_elem() using the network namespace of the calling
 * task (current->nsproxy->net_ns).
 */
static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct net *caller_net = current->nsproxy->net_ns;

	return __dev_map_hash_update_elem(caller_net, map, key, value,
					  map_flags);
}
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
@ -385,6 +584,15 @@ const struct bpf_map_ops dev_map_ops = {
.map_delete_elem = dev_map_delete_elem,
};
/*
 * Operations table for BPF_MAP_TYPE_DEVMAP_HASH. Allocation and teardown
 * are shared with the array-based devmap (dev_map_alloc/dev_map_free);
 * key iteration, lookup, update and delete use the hash-specific handlers.
 */
const struct bpf_map_ops dev_map_hash_ops = {
	.map_alloc		= dev_map_alloc,
	.map_free		= dev_map_free,
	.map_get_next_key	= dev_map_hash_get_next_key,
	.map_lookup_elem	= dev_map_hash_lookup_elem,
	.map_update_elem	= dev_map_hash_update_elem,
	.map_delete_elem	= dev_map_hash_delete_elem,
};
static int dev_map_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{

@ -1768,6 +1768,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
* for now.
*/
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
@ -1807,7 +1808,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP)
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)
goto error;
break;
case BPF_FUNC_sk_redirect_map:

@ -2594,7 +2594,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
goto err;
}
if (map->map_type == BPF_MAP_TYPE_DEVMAP)
fwd = __dev_map_lookup_elem(map, index);
else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
fwd = __dev_map_hash_lookup_elem(map, index);
if (!fwd) {
err = -EINVAL;
goto err;
@ -2665,7 +2668,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
map = NULL;
goto err;
}
if (map->map_type == BPF_MAP_TYPE_DEVMAP)
fwd = __dev_map_lookup_elem(map, index);
else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
fwd = __dev_map_hash_lookup_elem(map, index);
} else {
fwd = dev_get_by_index_rcu(dev_net(dev), index);
}

Loading…
Cancel
Save