// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2018 Facebook
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>

struct reuseport_array {
	struct bpf_map map;
	struct sock __rcu *ptrs[];
};

static struct reuseport_array *reuseport_array(struct bpf_map *map)
{
	return (struct reuseport_array *)map;
}

/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
	uintptr_t sk_user_data;

	write_lock_bh(&sk->sk_callback_lock);
	sk_user_data = (uintptr_t)sk->sk_user_data;
	if (sk_user_data & SK_USER_DATA_BPF) {
		struct sock __rcu **socks;

		socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
		WRITE_ONCE(sk->sk_user_data, NULL);
		/*
		 * Do not move this NULL assignment outside of
		 * sk->sk_callback_lock because there is
		 * a race with reuseport_array_free()
		 * which does not hold the reuseport_lock.
		 */
		RCU_INIT_POINTER(*socks, NULL);
	}
	write_unlock_bh(&sk->sk_callback_lock);
}
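
/*
 * Illustrative sketch (comment only, not part of the build): how the
 * tagged-pointer scheme above round-trips.  SK_USER_DATA_NOCOPY and
 * SK_USER_DATA_BPF occupy the low bits of sk_user_data, which are free
 * because a pointer into "ptrs[]" is at least pointer-aligned:
 *
 *	struct sock __rcu **slot = &array->ptrs[index];
 *	uintptr_t tagged = (uintptr_t)slot | SK_USER_DATA_NOCOPY |
 *			   SK_USER_DATA_BPF;
 *	// Recover the slot exactly as bpf_sk_reuseport_detach() does:
 *	slot = (void *)(tagged & SK_USER_DATA_PTRMASK);
 */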

static int reuseport_array_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32) &&
	    attr->value_size != sizeof(u64))
		return -EINVAL;

	return array_map_alloc_check(attr);
}

static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return rcu_dereference(array->ptrs[index]);
}
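
/*
 * A BPF_PROG_TYPE_SK_REUSEPORT program consumes the lookup above via
 * the bpf_sk_select_reuseport() helper.  A minimal sketch (the map
 * name "reuseport_map" and the indexing policy are assumptions):
 *
 *	SEC("sk_reuseport")
 *	int select_sock(struct sk_reuseport_md *md)
 *	{
 *		__u32 index = md->hash % 64;
 *
 *		if (bpf_sk_select_reuseport(md, &reuseport_map, &index, 0))
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 */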

/* Called from syscall only */
static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = *(u32 *)key;
	struct sock *sk;
	int err;

	if (index >= map->max_entries)
		return -E2BIG;

	if (!rcu_access_pointer(array->ptrs[index]))
		return -ENOENT;

	spin_lock_bh(&reuseport_lock);

	sk = rcu_dereference_protected(array->ptrs[index],
				       lockdep_is_held(&reuseport_lock));
	if (sk) {
		write_lock_bh(&sk->sk_callback_lock);
		WRITE_ONCE(sk->sk_user_data, NULL);
		RCU_INIT_POINTER(array->ptrs[index], NULL);
		write_unlock_bh(&sk->sk_callback_lock);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&reuseport_lock);

	return err;
}
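
/*
 * Userspace counterpart (a sketch; "map_fd" is assumed to be the fd of
 * this map).  Deleting a key detaches the socket from the map without
 * closing the socket itself:
 *
 *	__u32 key = 3;
 *
 *	if (bpf_map_delete_elem(map_fd, &key))
 *		// errno is ENOENT if no socket is stored at index 3.
 */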

static void reuseport_array_free(struct bpf_map *map)
{
	struct reuseport_array *array = reuseport_array(map);
	struct sock *sk;
	u32 i;

	/*
	 * ops->map_*_elem() will not be able to access this
	 * array now. Hence, this function only races with
	 * bpf_sk_reuseport_detach() which was triggered by
	 * close() or disconnect().
	 *
	 * This function and bpf_sk_reuseport_detach() are
	 * both removing sk from "array". Which one removes it
	 * first does not matter.
	 *
	 * The only concern here is that bpf_sk_reuseport_detach()
	 * may access "array" which is being freed here.
	 * bpf_sk_reuseport_detach() accesses this "array"
	 * through sk->sk_user_data _and_ with sk->sk_callback_lock
	 * held, which is enough because this "array" is not freed
	 * until every sk->sk_user_data has stopped referencing it.
	 *
	 * Hence, due to the above, taking "reuseport_lock" is not
	 * needed here.
	 */

	/*
	 * Since reuseport_lock is not taken, sk is accessed under
	 * rcu_read_lock().
	 */
	rcu_read_lock();
	for (i = 0; i < map->max_entries; i++) {
		sk = rcu_dereference(array->ptrs[i]);
		if (sk) {
			write_lock_bh(&sk->sk_callback_lock);
			/*
			 * No need for WRITE_ONCE(). At this point,
			 * no one is reading it without taking the
			 * sk->sk_callback_lock.
			 */
			sk->sk_user_data = NULL;
			write_unlock_bh(&sk->sk_callback_lock);
			RCU_INIT_POINTER(array->ptrs[i], NULL);
		}
	}
	rcu_read_unlock();

	/*
	 * Once we reach here, no sk->sk_user_data is referencing
	 * this "array" any more. "array" can be freed now.
	 */
	bpf_map_area_free(array);
}

static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
	int err, numa_node = bpf_map_attr_numa_node(attr);
	struct reuseport_array *array;
	struct bpf_map_memory mem;
	u64 array_size;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	array_size = sizeof(*array);
	array_size += (u64)attr->max_entries * sizeof(struct sock *);

	err = bpf_map_charge_init(&mem, array_size);
	if (err)
		return ERR_PTR(err);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size, numa_node);
	if (!array) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	bpf_map_charge_move(&array->map.memory, &mem);

	return &array->map;
}
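
/*
 * How a map of this type is typically created from userspace (a sketch
 * assuming a libbpf recent enough to provide bpf_map_create(); older
 * libbpf exposed the same operation as bpf_create_map()).  A
 * value_size of 8 selects the 64-bit fd/cookie form accepted by the
 * alloc check above:
 *
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 *				    NULL, sizeof(__u32), sizeof(__u64),
 *				    64, NULL);
 */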

int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct sock *sk;
	int err;

	if (map->value_size != sizeof(u64))
		return -ENOSPC;

	rcu_read_lock();
	sk = reuseport_array_lookup_elem(map, key);
	if (sk) {
		*(u64 *)value = __sock_gen_cookie(sk);
		err = 0;
	} else {
		err = -ENOENT;
	}
	rcu_read_unlock();

	return err;
}
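
/*
 * Userspace view of the syscall lookup above (a sketch; "map_fd" is an
 * assumption).  The -ENOSPC above reflects that a 4-byte value cannot
 * hold the 64-bit socket cookie, which is the same cookie exposed by
 * getsockopt(SO_COOKIE) on the stored socket:
 *
 *	__u32 key = 0;
 *	__u64 cookie;
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, &cookie))
 *		// "cookie" identifies the socket stored at index 0.
 */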

static int
reuseport_array_update_check(const struct reuseport_array *array,
			     const struct sock *nsk,
			     const struct sock *osk,
			     const struct sock_reuseport *nsk_reuse,
			     u32 map_flags)
{
	if (osk && map_flags == BPF_NOEXIST)
		return -EEXIST;

	if (!osk && map_flags == BPF_EXIST)
		return -ENOENT;

	if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
		return -ENOTSUPP;

	if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
		return -ENOTSUPP;

	if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
		return -ENOTSUPP;

	/*
	 * sk must be hashed (i.e. listening in the TCP case or bound
	 * in the UDP case) and
	 * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
	 *
	 * Also, sk will be used in a bpf helper that is protected by
	 * rcu_read_lock().
	 */
	if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
		return -EINVAL;

	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
	if (READ_ONCE(nsk->sk_user_data))
		return -EBUSY;

	return 0;
}

/*
 * Called from syscall only.
 * The "nsk" is kept alive by the fd's refcnt.
 * The "osk" and "reuse" are protected by reuseport_lock.
 */
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	struct reuseport_array *array = reuseport_array(map);
	struct sock *free_osk = NULL, *osk, *nsk;
	struct sock_reuseport *reuse;
	u32 index = *(u32 *)key;
	uintptr_t sk_user_data;
	struct socket *socket;
	int err, fd;

	if (map_flags > BPF_EXIST)
		return -EINVAL;

	if (index >= map->max_entries)
		return -E2BIG;

	if (map->value_size == sizeof(u64)) {
		u64 fd64 = *(u64 *)value;

		if (fd64 > S32_MAX)
			return -EINVAL;
		fd = fd64;
	} else {
		fd = *(int *)value;
	}

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	nsk = socket->sk;
	if (!nsk) {
		err = -EINVAL;
		goto put_file;
	}

	/* Quick checks before taking reuseport_lock */
	err = reuseport_array_update_check(array, nsk,
					   rcu_access_pointer(array->ptrs[index]),
					   rcu_access_pointer(nsk->sk_reuseport_cb),
					   map_flags);
	if (err)
		goto put_file;

	spin_lock_bh(&reuseport_lock);
	/*
	 * Some of the checks only need the reuseport_lock,
	 * but they are done under the sk_callback_lock as well,
	 * for simplicity.
	 */
	write_lock_bh(&nsk->sk_callback_lock);

	osk = rcu_dereference_protected(array->ptrs[index],
					lockdep_is_held(&reuseport_lock));
	reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
	if (err)
		goto put_file_unlock;

	sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY |
		SK_USER_DATA_BPF;
	WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data);
	rcu_assign_pointer(array->ptrs[index], nsk);
	free_osk = osk;
	err = 0;

put_file_unlock:
	write_unlock_bh(&nsk->sk_callback_lock);

	if (free_osk) {
		write_lock_bh(&free_osk->sk_callback_lock);
		WRITE_ONCE(free_osk->sk_user_data, NULL);
		write_unlock_bh(&free_osk->sk_callback_lock);
	}

	spin_unlock_bh(&reuseport_lock);
put_file:
	fput(socket->file);
	return err;
}
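
/*
 * Typical userspace sequence feeding the update path above (a sketch;
 * "map_fd", "addr" and the listen backlog are placeholders).  The
 * socket must be a SO_REUSEPORT socket and already hashed (listening
 * for TCP, bound for UDP) before it can be stored:
 *
 *	int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *	int one = 1;
 *	__u32 key = 0;
 *	__u64 val;
 *
 *	setsockopt(lfd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(lfd, 128);
 *	val = lfd;	// 64-bit value form; a 32-bit int works for
 *			// maps created with value_size == 4.
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST);
 */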

/* Called from syscall */
static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
					void *next_key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}
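
/*
 * Iterating all keys from userspace with the helper above (a sketch;
 * "map_fd" is an assumption).  A NULL key yields the first index, and
 * an ENOENT failure terminates the walk at the last index:
 *
 *	__u32 key, next;
 *	int err;
 *
 *	for (err = bpf_map_get_next_key(map_fd, NULL, &next);
 *	     !err;
 *	     err = bpf_map_get_next_key(map_fd, &key, &next)) {
 *		key = next;
 *		// ... use "key", e.g. look up its socket cookie ...
 *	}
 */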

static int reuseport_array_map_btf_id;
const struct bpf_map_ops reuseport_array_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = reuseport_array_alloc_check,
	.map_alloc = reuseport_array_alloc,
	.map_free = reuseport_array_free,
	.map_lookup_elem = reuseport_array_lookup_elem,
	.map_get_next_key = reuseport_array_get_next_key,
	.map_delete_elem = reuseport_array_delete_elem,
	.map_btf_name = "reuseport_array",
	.map_btf_id = &reuseport_array_map_btf_id,
};