^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-or-later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /* -*- mode: c; c-basic-offset: 8; -*-
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * vim: noexpandtab sw=8 ts=8 sts=0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Copyright (C) 2005 Oracle. All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) /* This quorum hack is only here until we transition to some more rational
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * approach that is driven from userspace. Honest. No foolin'.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * Imagine two nodes lose network connectivity to each other but they're still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * up and operating in every other way. Presumably a network timeout indicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * that a node is broken and should be recovered. They can't both recover each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * other and both carry on without serialising their access to the file system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * They need to decide who is authoritative. Now extend that problem to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * arbitrary groups of nodes losing connectivity between each other.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * So we declare that a node which has given up on connecting to a majority
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) * of nodes who are still heartbeating will fence itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) * There are huge opportunities for races here. After we give up on a node's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) * connection we need to wait long enough to give heartbeat an opportunity
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * to declare the node as truly dead. We also need to be careful with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) * race between when we see a node start heartbeating and when we connect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) * to it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) * So nodes that are in this transition put a hold on the quorum decision
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * with a counter. As they fall out of this transition they drop the count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * and if they're the last, they fire off the decision.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #include <linux/kernel.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/workqueue.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/reboot.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include "heartbeat.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include "nodemanager.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #define MLOG_MASK_PREFIX ML_QUORUM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include "masklog.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include "quorum.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) static struct o2quo_state {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) spinlock_t qs_lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) struct work_struct qs_work;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) int qs_pending;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) int qs_heartbeating;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) int qs_connected;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) int qs_holds;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) } o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) /* this is horribly heavy-handed. It should instead flip the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * system RO and call some userspace script. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) static void o2quo_fence_self(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) /* panic spins with interrupts enabled. with preempt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) * threads can still schedule, etc, etc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) o2hb_stop_all_regions();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) switch (o2nm_single_cluster->cl_fence_method) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) case O2NM_FENCE_PANIC:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) panic("*** ocfs2 is very sorry to be fencing this system by "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) "panicing ***\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) WARN_ON(o2nm_single_cluster->cl_fence_method >=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) O2NM_FENCE_METHODS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) case O2NM_FENCE_RESET:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) "system by restarting ***\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) emergency_restart();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
/* Called to report that a write to a heartbeat region timed out.  By then
 * the other nodes in the cluster may already consider us dead, so we
 * "fence" ourselves rather than risk scribbling on the disk after they
 * think they've recovered us.
 *
 * This hack can't solve every problem related to writeout after recovery,
 * but it closes some of those gaps.  Once real fencing exists this can go
 * away, as our node would be fenced externally before other nodes begin
 * recovery. */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) static void o2quo_make_decision(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) int quorum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) int lowest_hb, lowest_reachable = 0, fence = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) if (lowest_hb != O2NM_MAX_NODES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) mlog(0, "heartbeating: %d, connected: %d, "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) qs->qs_heartbeating == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) if (qs->qs_heartbeating & 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) /* the odd numbered cluster case is straight forward --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) * if we can't talk to the majority we're hosed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) quorum = (qs->qs_heartbeating + 1)/2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) if (qs->qs_connected < quorum) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) mlog(ML_ERROR, "fencing this node because it is "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) "only connected to %u nodes and %u is needed "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) "to make a quorum out of %u heartbeating nodes\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) qs->qs_connected, quorum,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) qs->qs_heartbeating);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) fence = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) /* the even numbered cluster adds the possibility of each half
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) * of the cluster being able to talk amongst themselves.. in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) * that case we're hosed if we can't talk to the group that has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) * the lowest numbered node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) quorum = qs->qs_heartbeating / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) if (qs->qs_connected < quorum) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) mlog(ML_ERROR, "fencing this node because it is "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) "only connected to %u nodes and %u is needed "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) "to make a quorum out of %u heartbeating nodes\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) qs->qs_connected, quorum,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) qs->qs_heartbeating);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) fence = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) else if ((qs->qs_connected == quorum) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) !lowest_reachable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) mlog(ML_ERROR, "fencing this node because it is "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) "connected to a half-quorum of %u out of %u "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) "nodes which doesn't include the lowest active "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) "node %u\n", quorum, qs->qs_heartbeating,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) lowest_hb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) fence = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) if (fence) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) o2quo_fence_self();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) "connected: %d, lowest: %d (%sreachable)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) qs->qs_heartbeating, qs->qs_connected, lowest_hb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) lowest_reachable ? "" : "un");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) assert_spin_locked(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) if (!test_and_set_bit(node, qs->qs_hold_bm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) qs->qs_holds++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) mlog(0, "node %u, %d total\n", node, qs->qs_holds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) assert_spin_locked(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) if (test_and_clear_bit(node, qs->qs_hold_bm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) if (--qs->qs_holds == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) if (qs->qs_pending) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) qs->qs_pending = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) schedule_work(&qs->qs_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) node, qs->qs_holds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) /* as a node comes up we delay the quorum decision until we know the fate of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) * the connection. the hold will be droped in conn_up or hb_down. it might be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) * perpetuated by con_err until hb_down. if we already have a conn, we might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) * be dropping a hold that conn_up got. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) void o2quo_hb_up(u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) qs->qs_heartbeating++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) set_bit(node, qs->qs_hb_bm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) if (!test_bit(node, qs->qs_conn_bm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) o2quo_set_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) o2quo_clear_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) /* hb going down releases any holds we might have had due to this node from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) * conn_up, conn_err, or hb_up */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) void o2quo_hb_down(u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) qs->qs_heartbeating--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) mlog_bug_on_msg(qs->qs_heartbeating < 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) "node %u, %d heartbeating\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) node, qs->qs_heartbeating);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) clear_bit(node, qs->qs_hb_bm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) o2quo_clear_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) /* this tells us that we've decided that the node is still heartbeating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) * even though we've lost it's conn. it must only be called after conn_err
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) * and indicates that we must now make a quorum decision in the future,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) * though we might be doing so after waiting for holds to drain. Here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) * we'll be dropping the hold from conn_err. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) void o2quo_hb_still_up(u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) mlog(0, "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) qs->qs_pending = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) o2quo_clear_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) /* This is analogous to hb_up. as a node's connection comes up we delay the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) * quorum decision until we see it heartbeating. the hold will be droped in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) * it's already heartbeating we might be dropping a hold that conn_up got.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) * */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) void o2quo_conn_up(u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) qs->qs_connected++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) set_bit(node, qs->qs_conn_bm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) mlog(0, "node %u, %d total\n", node, qs->qs_connected);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) if (!test_bit(node, qs->qs_hb_bm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) o2quo_set_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) o2quo_clear_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) /* we've decided that we won't ever be connecting to the node again. if it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) * still heartbeating we grab a hold that will delay decisions until either the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) * node stops heartbeating from hb_down or the caller decides that the node is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) * still up and calls still_up */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) void o2quo_conn_err(u8 node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) spin_lock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) if (test_bit(node, qs->qs_conn_bm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) qs->qs_connected--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) mlog_bug_on_msg(qs->qs_connected < 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) "node %u, connected %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) node, qs->qs_connected);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) clear_bit(node, qs->qs_conn_bm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) if (test_bit(node, qs->qs_hb_bm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) o2quo_set_hold(qs, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) mlog(0, "node %u, %d total\n", node, qs->qs_connected);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) spin_unlock(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) void o2quo_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) spin_lock_init(&qs->qs_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) INIT_WORK(&qs->qs_work, o2quo_make_decision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) void o2quo_exit(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) struct o2quo_state *qs = &o2quo_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) flush_work(&qs->qs_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) }