^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Restartable sequences system call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 2015, Google, Inc.,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Copyright (C) 2015-2018, EfficiOS Inc.,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #include <linux/sched.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include <linux/uaccess.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include <linux/syscalls.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #include <linux/rseq.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include <linux/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #include <asm/ptrace.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #define CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include <trace/events/rseq.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20)
/*
 * Mask combining the two critical-section flags that inhibit restart on
 * preemption and on migration.
 */
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS \
	(RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) * Restartable sequences are a lightweight interface that allows
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) * user-level code to be executed atomically relative to scheduler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * preemption and signal delivery. Typically used for implementing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * per-cpu operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) * It allows user-space to perform update operations on per-cpu data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) * without requiring heavy-weight atomic operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) * Detailed algorithm of rseq user-space assembly sequences:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) * init(rseq_cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) * cpu = TLS->rseq::cpu_id_start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) * [1] TLS->rseq::rseq_cs = rseq_cs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) * [start_ip] ----------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) * [2] if (cpu != TLS->rseq::cpu_id)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) * goto abort_ip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) * [3] <last_instruction_in_cs>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) * [post_commit_ip] ----------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) * The address of jump target abort_ip must be outside the critical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) * region, i.e.:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) * userspace that can handle being interrupted between any of those
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) * instructions, and then resumed to the abort_ip.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) * 1. Userspace stores the address of the struct rseq_cs assembly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * block descriptor into the rseq_cs field of the registered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) * struct rseq TLS area. This update is performed through a single
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) * store within the inline assembly instruction sequence.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) * [start_ip]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) * 2. Userspace tests to check whether the current cpu_id field matches
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) * the cpu number loaded before start_ip, branching to abort_ip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) * in case of a mismatch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * If the sequence is preempted or interrupted by a signal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * at or after start_ip and before post_commit_ip, then the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * clears TLS->rseq::rseq_cs, and sets the user-space return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) * ip to abort_ip before returning to user-space, so the preempted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * execution resumes at abort_ip.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) * 3. Userspace critical section final instruction before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) * post_commit_ip is the commit. The critical section is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) * self-terminating.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) * [post_commit_ip]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * 4. <success>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) * On failure at [2], or if interrupted by preempt or signal delivery
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) * between [1] and [3]:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) * [abort_ip]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) * F1. <failure>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) static int rseq_update_cpu_id(struct task_struct *t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) u32 cpu_id = raw_smp_processor_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) if (put_user(cpu_id, &t->rseq->cpu_id_start))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) if (put_user(cpu_id, &t->rseq->cpu_id))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) trace_rseq_update(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) static int rseq_reset_rseq_cpu_id(struct task_struct *t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) * Reset cpu_id_start to its initial state (0).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) * in after unregistration can figure out that rseq needs to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) * registered again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) if (put_user(cpu_id, &t->rseq->cpu_id))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) struct rseq_cs __user *urseq_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) u64 ptr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) u32 __user *usig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) u32 sig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) #ifdef CONFIG_64BIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) if (get_user(ptr, &t->rseq->rseq_cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) if (!ptr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) memset(rseq_cs, 0, sizeof(*rseq_cs));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) if (ptr >= TASK_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) if (rseq_cs->start_ip >= TASK_SIZE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) rseq_cs->abort_ip >= TASK_SIZE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) rseq_cs->version > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) /* Check for overflow. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) /* Ensure that abort_ip is not in the critical section. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) ret = get_user(sig, usig);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) if (current->rseq_sig != sig) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) printk_ratelimited(KERN_WARNING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) sig, current->rseq_sig, current->pid, usig);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) u32 flags, event_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) /* Get thread flags. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) ret = get_user(flags, &t->rseq->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) /* Take critical section flags into account. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) flags |= cs_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) * Restart on signal can only be inhibited when restart on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) * preempt and restart on migrate are inhibited too. Otherwise,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) * a preempted signal handler could fail to restart the prior
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) * execution context on sigreturn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) * Load and clear event mask atomically with respect to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) * scheduler preemption.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) preempt_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) event_mask = t->rseq_event_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) t->rseq_event_mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) preempt_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) return !!(event_mask & ~flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) static int clear_rseq_cs(struct task_struct *t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) * The rseq_cs field is set to NULL on preemption or signal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) * delivery on top of rseq assembly block, as well as on top
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) * of code outside of the rseq assembly block. This performs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) * a lazy clear of the rseq_cs field.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) * Set rseq_cs to NULL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) #ifdef CONFIG_64BIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) return put_user(0UL, &t->rseq->rseq_cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) * Unsigned comparison will be true when ip >= start_ip, and when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) * ip < start_ip + post_commit_offset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) static int rseq_ip_fixup(struct pt_regs *regs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) unsigned long ip = instruction_pointer(regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) struct task_struct *t = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) struct rseq_cs rseq_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) ret = rseq_get_rseq_cs(t, &rseq_cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) * Handle potentially not being within a critical section.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) * If not nested over a rseq critical section, restart is useless.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) * Clear the rseq_cs pointer and return.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) if (!in_rseq_cs(ip, &rseq_cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) return clear_rseq_cs(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) ret = rseq_need_restart(t, rseq_cs.flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) if (ret <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) ret = clear_rseq_cs(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) rseq_cs.abort_ip);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) * This resume handler must always be executed between any of:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) * - preemption,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) * - signal delivery,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) * and return to user-space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) * This is how we can ensure that the entire rseq critical section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) * will issue the commit instruction only if executed atomically with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) * respect to other threads scheduled on the same CPU, and with respect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) * to signal handlers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) struct task_struct *t = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) int ret, sig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) if (unlikely(t->flags & PF_EXITING))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) * regs is NULL if and only if the caller is in a syscall path. Skip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) * kill a misbehaving userspace on debug kernels.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) if (regs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) ret = rseq_ip_fixup(regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) if (unlikely(ret < 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) if (unlikely(rseq_update_cpu_id(t)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) error:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) sig = ksig ? ksig->sig : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) force_sigsegv(sig);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) #ifdef CONFIG_DEBUG_RSEQ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) * Terminate the process if a syscall is issued within a restartable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) * sequence.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) void rseq_syscall(struct pt_regs *regs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) unsigned long ip = instruction_pointer(regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) struct task_struct *t = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) struct rseq_cs rseq_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) if (!t->rseq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) force_sig(SIGSEGV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) * sys_rseq - setup restartable sequences for caller thread.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) int, flags, u32, sig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) if (flags & RSEQ_FLAG_UNREGISTER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) if (flags & ~RSEQ_FLAG_UNREGISTER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) /* Unregister rseq for current thread. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) if (current->rseq != rseq || !current->rseq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) if (rseq_len != sizeof(*rseq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) if (current->rseq_sig != sig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) return -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) ret = rseq_reset_rseq_cpu_id(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) current->rseq = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) current->rseq_sig = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) if (unlikely(flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) if (current->rseq) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) * If rseq is already registered, check whether
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) * the provided address differs from the prior
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) * one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) if (current->rseq != rseq || rseq_len != sizeof(*rseq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) if (current->rseq_sig != sig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) return -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) /* Already registered. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) * If there was no rseq previously registered,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) * ensure the provided rseq is properly aligned and valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) rseq_len != sizeof(*rseq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) if (!access_ok(rseq, rseq_len))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) current->rseq = rseq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) current->rseq_sig = sig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) * If rseq was previously inactive, and has just been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) * registered, ensure the cpu_id_start and cpu_id fields
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) * are updated before returning to user-space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) rseq_set_notify_resume(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) }