// SPDX-License-Identifier: GPL-2.0
#include <linux/swap_cgroup.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

#include <linux/swapops.h> /* depends on mm.h include */

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short id;
};
#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
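
/*
 * Worked example (illustrative, assuming 4 KiB pages): sizeof(struct
 * swap_cgroup) is 2 bytes, so SC_PER_PAGE is 4096 / 2 = 2048 entries per
 * map page. A 1 GiB swap device has 262144 page-sized slots and therefore
 * needs DIV_ROUND_UP(262144, 2048) = 128 map pages.
 */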

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * handling of SwapCache. At swap_free() time it is accessed directly from
 * the swap code.
 *
 * This means:
 * - there is no race on "exchange" when accessed via SwapCache, because
 *   the SwapCache (and its swp_entry) is under lock.
 * - when called via swap_free(), there is no remaining user of the entry,
 *   so again there is no race.
 * Therefore no lock is needed around "exchange" itself.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * Allocate the buffer pages that back swap_cgroup for swap device @type.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;

		if (!(idx % SWAP_CLUSTER_MAX))
			cond_resched();
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
						pgoff_t offset)
{
	struct page *mappage;
	struct swap_cgroup *sc;

	mappage = ctrl->map[offset / SC_PER_PAGE];
	sc = page_address(mappage);
	return sc + offset % SC_PER_PAGE;
}
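
/*
 * Worked example (illustrative): with SC_PER_PAGE == 2048, swap offset
 * 5000 resolves to map page 5000 / 2048 = 2, slot 5000 % 2048 = 904.
 */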

static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
					struct swap_cgroup_ctrl **ctrlp)
{
	pgoff_t offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl;

	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	if (ctrlp)
		*ctrlp = ctrl;
	return __lookup_swap_cgroup(ctrl, offset);
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
				   unsigned short old, unsigned short new)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}
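
/*
 * Illustrative sketch (not a kernel caller; the helper name is
 * hypothetical): atomically retarget a swap entry from one cgroup id to
 * another, tolerating a concurrent change. Kept under #if 0 so it is
 * never built.
 */
#if 0
static bool example_move_swap_charge(swp_entry_t entry,
				     unsigned short from, unsigned short to)
{
	/* Succeeds only if the entry still belongs to @from. */
	return swap_cgroup_cmpxchg(entry, from, to) == from;
}
#endif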

/**
 * swap_cgroup_record - record mem_cgroup id for a set of swap entries
 * @ent: the first swap entry to be recorded into
 * @id: mem_cgroup id to be recorded
 * @nr_ents: number of swap entries to be recorded
 *
 * Returns the id previously recorded for the first entry (which may
 * itself be 0; all @nr_ents entries are expected to hold the same id).
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
				  unsigned int nr_ents)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;
	pgoff_t offset = swp_offset(ent);
	pgoff_t end = offset + nr_ents;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	for (;;) {
		VM_BUG_ON(sc->id != old);
		sc->id = id;
		offset++;
		if (offset == end)
			break;
		if (offset % SC_PER_PAGE)
			sc++;
		else
			sc = __lookup_swap_cgroup(ctrl, offset);
	}
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}
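
/*
 * Illustrative sketch (not a kernel caller; the function names are
 * hypothetical, real callers live in mm/memcontrol.c): tag a run of swap
 * entries with an owning memcg id at swap-out time, then clear the tag by
 * recording id 0 when the entries are freed.
 */
#if 0
static void example_account_swapout(swp_entry_t entry,
				    unsigned short memcg_id,
				    unsigned int nr_pages)
{
	unsigned short old;

	old = swap_cgroup_record(entry, memcg_id, nr_pages);
	VM_BUG_ON(old != 0);	/* a fresh swap entry must be unowned */
}

static unsigned short example_account_swapfree(swp_entry_t entry,
					       unsigned int nr_pages)
{
	/* Recording 0 marks the entries unowned; the old owner is returned. */
	return swap_cgroup_record(entry, 0, nr_pages);
}
#endif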

/**
 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to a swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the ID of the owning mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
	return lookup_swap_cgroup(ent, NULL)->id;
}
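
/*
 * Illustrative sketch (not a kernel caller; the helper name is
 * hypothetical): map a swap entry back to its owning mem_cgroup.
 * mem_cgroup_from_id() must be called under rcu_read_lock().
 */
#if 0
static bool example_swap_entry_is_owned(swp_entry_t entry)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_id(lookup_swap_cgroup_id(entry));
	rcu_read_unlock();

	return memcg != NULL;	/* id 0 maps to no mem_cgroup */
}
#endif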

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
	array_size = length * sizeof(void *);

	array = vzalloc(array_size);
	if (!array)
		goto nomem;

	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		mutex_unlock(&swap_cgroup_mutex);
		vfree(array);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	pr_info("couldn't allocate enough memory for swap_cgroup\n");
	pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
	return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
	struct page **map;
	unsigned long i, length;
	struct swap_cgroup_ctrl *ctrl;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	map = ctrl->map;
	length = ctrl->length;
	ctrl->map = NULL;
	ctrl->length = 0;
	mutex_unlock(&swap_cgroup_mutex);

	if (map) {
		for (i = 0; i < length; i++) {
			struct page *page = map[i];

			if (page)
				__free_page(page);
			if (!(i % SWAP_CLUSTER_MAX))
				cond_resched();
		}
		vfree(map);
	}
}