// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>

/*
 * An MCS-like lock especially tailored for optimistic spinning for sleeping
 * lock implementations (mutex, rwsem, etc).
 *
 * Using a single mcs node per CPU is safe because sleeping locks should not be
 * called from interrupt context and we have preemption disabled while
 * spinning.
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
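
/*
 * Typical usage (illustrative sketch only, not taken from this file): a
 * sleeping lock's slowpath brackets its optimistic spin with
 * osq_lock()/osq_unlock(), roughly along the lines of the mutex/rwsem
 * spinners.  owner_still_running() is a placeholder for the caller's own
 * "is spinning still worthwhile" test, and preemption is assumed to already
 * be disabled, as required by the comment above:
 *
 *	if (osq_lock(&sem->osq)) {
 *		while (owner_still_running(sem) && !need_resched())
 *			cpu_relax();
 *		osq_unlock(&sem->osq);
 *	}
 */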

/*
 * We use the value 0 to represent "no CPU", thus the encoded value
 * will be the CPU number incremented by 1.
 */
static inline int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

static inline int node_cpu(struct optimistic_spin_node *node)
{
	return node->cpu - 1;
}

static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
	int cpu_nr = encoded_cpu_val - 1;

	return per_cpu_ptr(&osq_node, cpu_nr);
}
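
/*
 * For example, CPU 2 is stored as 3 in lock->tail, decode_cpu(3) returns
 * CPU 2's osq_node, and OSQ_UNLOCKED_VAL (0) encodes "no CPU", so it is
 * never handed to decode_cpu().
 */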

/*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 */
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
	      struct optimistic_spin_node *node,
	      struct optimistic_spin_node *prev)
{
	struct optimistic_spin_node *next = NULL;
	int curr = encode_cpu(smp_processor_id());
	int old;

	/*
	 * If there is a prev node in the queue, then the 'old' value will be
	 * the prev node's CPU #; otherwise it's set to OSQ_UNLOCKED_VAL,
	 * because if we're currently last in the queue, the queue becomes
	 * empty once we leave.
	 */
	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

	for (;;) {
		if (atomic_read(&lock->tail) == curr &&
		    atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
			/*
			 * We were the last queued, we moved @lock back. @prev
			 * will now observe @lock and will complete its
			 * unlock()/unqueue().
			 */
			break;
		}

		/*
		 * We must xchg() the @node->next value, because if we were to
		 * leave it in, a concurrent unlock()/unqueue() from
		 * @node->next might complete Step-A and think its @prev is
		 * still valid.
		 *
		 * If the concurrent unlock()/unqueue() wins the race, we'll
		 * wait either for @lock to point to us, through its Step-B,
		 * or for a new @node->next from its Step-C.
		 */
		if (node->next) {
			next = xchg(&node->next, NULL);
			if (next)
				break;
		}

		cpu_relax();
	}

	return next;
}
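
/*
 * The two exits above cover every case: a NULL return means we were the
 * tail and moved lock->tail back to @prev (or to unlocked), so there is no
 * successor to hand anything to; a non-NULL return is the successor whose
 * @node->next link was just cleared, ready for the caller's Step-C relink
 * or for osq_unlock() to set its ->locked.
 */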

bool osq_lock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_node *prev, *next;
	int curr = encode_cpu(smp_processor_id());
	int old;

	node->locked = 0;
	node->next = NULL;
	node->cpu = curr;

	/*
	 * We need both ACQUIRE (pairs with the corresponding RELEASE in
	 * unlock(), uncontended or fastpath) and RELEASE (to publish
	 * the node fields we just initialised) semantics when updating
	 * the lock tail.
	 */
	old = atomic_xchg(&lock->tail, curr);
	if (old == OSQ_UNLOCKED_VAL)
		return true;
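
	/*
	 * Seeing OSQ_UNLOCKED_VAL from the xchg above means the queue was
	 * empty and we now own the OSQ outright.  Otherwise @old identifies
	 * the previous tail; link in behind it below and spin until it hands
	 * the OSQ over or we decide to bail out.
	 */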

	prev = decode_cpu(old);
	node->prev = prev;

	/*
	 * osq_lock()			unqueue
	 *
	 * node->prev = prev		osq_wait_next()
	 * WMB				MB
	 * prev->next = node		next->prev = prev // unqueue-C
	 *
	 * Here 'node->prev' and 'next->prev' are the same variable and we need
	 * to ensure these stores happen in-order to avoid corrupting the list.
	 */
	smp_wmb();

	WRITE_ONCE(prev->next, node);

	/*
	 * Normally @prev is untouchable after the above store, because at
	 * that moment unlock() can proceed and wipe the node element from
	 * the stack.
	 *
	 * However, since our nodes are static per-cpu storage, we're
	 * guaranteed their existence -- this allows us to apply
	 * cmpxchg in an attempt to undo our queueing.
	 */

	/*
	 * Wait to acquire the lock or for cancellation. Note that
	 * need_resched() will come with an IPI, which will wake
	 * smp_cond_load_relaxed() if it is implemented with a monitor-wait.
	 * vcpu_is_preempted() relies on polling, so be careful.
	 */
	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
				  vcpu_is_preempted(node_cpu(node->prev))))
		return true;
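
	/*
	 * In the smp_cond_load_relaxed() above, VAL is the freshly loaded
	 * value of node->locked: we stop spinning either because unlock()
	 * handed us the OSQ (return true above), or because need_resched()
	 * fired or the CPU we are queued behind is a preempted vCPU, in
	 * which case we fall through and unqueue ourselves.
	 */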

	/* unqueue */
	/*
	 * Step - A  -- stabilize @prev
	 *
	 * Undo our @prev->next assignment; this will make @prev's
	 * unlock()/unqueue() wait for a next pointer since @lock points to us
	 * (or later).
	 */

	for (;;) {
		/*
		 * cpu_relax() below implies a compiler barrier which would
		 * prevent this comparison from being optimized away.
		 */
		if (data_race(prev->next) == node &&
		    cmpxchg(&prev->next, node, NULL) == node)
			break;

		/*
		 * We can only fail the cmpxchg() racing against an unlock(),
		 * in which case we should observe @node->locked becoming
		 * true.
		 */
		if (smp_load_acquire(&node->locked))
			return true;

		cpu_relax();

		/*
		 * Or we race against a concurrent unqueue()'s step-B, in which
		 * case its step-C will write us a new @node->prev pointer.
		 */
		prev = READ_ONCE(node->prev);
	}

	/*
	 * Step - B -- stabilize @next
	 *
	 * Similar to unlock(), wait for @node->next or move @lock from @node
	 * back to @prev.
	 */

	next = osq_wait_next(lock, node, prev);
	if (!next)
		return false;

	/*
	 * Step - C -- unlink
	 *
	 * @prev is stable because it's still waiting for a new @prev->next
	 * pointer, @next is stable because our @node->next pointer is NULL and
	 * it will wait in Step-A.
	 */

	WRITE_ONCE(next->prev, prev);
	WRITE_ONCE(prev->next, next);

	return false;
}
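
/*
 * Note for callers: an osq_lock() return of false means we unqueued
 * ourselves above (need_resched() or a preempted vCPU made spinning a bad
 * idea), so the OSQ was never acquired and osq_unlock() must not be called;
 * callers typically stop spinning and fall back to their blocking slowpath.
 */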

void osq_unlock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node, *next;
	int curr = encode_cpu(smp_processor_id());

	/*
	 * Fast path for the uncontended case.
	 */
	if (likely(atomic_cmpxchg_release(&lock->tail, curr,
					  OSQ_UNLOCKED_VAL) == curr))
		return;

	/*
	 * Second most likely case.
	 */
	node = this_cpu_ptr(&osq_node);
	next = xchg(&node->next, NULL);
	if (next) {
		WRITE_ONCE(next->locked, 1);
		return;
	}
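
	/*
	 * Neither fast path worked: somebody queued behind us but has not
	 * yet published itself via our ->next pointer, or is busy
	 * unqueueing.  Wait until either a successor shows up, in which
	 * case hand it the OSQ, or the tail falls back to us because the
	 * successor unqueued itself, in which case there is nobody left
	 * to wake.
	 */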
	next = osq_wait_next(lock, node, NULL);
	if (next)
		WRITE_ONCE(next->locked, 1);
}