^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /******************************************************************************
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *******************************************************************************
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) **
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) ** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) **
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) **
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *******************************************************************************
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) ******************************************************************************/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include "dlm_internal.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include "lockspace.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #include "member.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include "dir.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #include "ast.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #include "recover.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include "lowcomms.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include "lock.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #include "requestqueue.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include "recoverd.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) /* If the start for which we're re-enabling locking (seq) has been superseded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) by a newer stop (ls_recover_seq), we need to leave locking disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) enables locking and clears the requestqueue between a and b. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) static int enable_locking(struct dlm_ls *ls, uint64_t seq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) int error = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) down_write(&ls->ls_recv_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) spin_lock(&ls->ls_recover_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) if (ls->ls_recover_seq == seq) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) set_bit(LSFL_RUNNING, &ls->ls_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) /* unblocks processes waiting to enter the dlm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) up_write(&ls->ls_in_recovery);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) spin_unlock(&ls->ls_recover_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) up_write(&ls->ls_recv_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) unsigned long start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) int error, neg = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) mutex_lock(&ls->ls_recoverd_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) dlm_callback_suspend(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) dlm_clear_toss(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * This list of root rsb's will be the basis of most of the recovery
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * routines.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) dlm_create_root_list(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) * Add or remove nodes from the lockspace's ls_nodes list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) error = dlm_recover_members(ls, rv, &neg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) log_rinfo(ls, "dlm_recover_members error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) dlm_recover_dir_nodeid(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) ls->ls_recover_dir_sent_res = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) ls->ls_recover_dir_sent_msg = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) ls->ls_recover_locks_in = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) dlm_set_recover_status(ls, DLM_RS_NODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) error = dlm_recover_members_wait(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) log_rinfo(ls, "dlm_recover_members_wait error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) start = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) * Rebuild our own share of the directory by collecting from all other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) * nodes their master rsb names that hash to us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) error = dlm_recover_directory(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) log_rinfo(ls, "dlm_recover_directory error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) dlm_set_recover_status(ls, DLM_RS_DIR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) error = dlm_recover_directory_wait(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) log_rinfo(ls, "dlm_recover_directory %u out %u messages",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) * We may have outstanding operations that are waiting for a reply from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) * a failed node. Mark these to be resent after recovery. Unlock and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) * cancel ops can just be completed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) dlm_recover_waiters_pre(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) error = dlm_recovery_stopped(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) if (neg || dlm_no_directory(ls)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) * Clear lkb's for departed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) dlm_recover_purge(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) * Get new master nodeid's for rsb's that were mastered on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) * departed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) error = dlm_recover_masters(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) log_rinfo(ls, "dlm_recover_masters error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) * Send our locks on remastered rsb's to the new masters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) error = dlm_recover_locks(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) log_rinfo(ls, "dlm_recover_locks error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) dlm_set_recover_status(ls, DLM_RS_LOCKS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) error = dlm_recover_locks_wait(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) log_rinfo(ls, "dlm_recover_locks %u in",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) ls->ls_recover_locks_in);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) * Finalize state in master rsb's now that all locks can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) * checked. This includes conversion resolution and lvb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) * settings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) dlm_recover_rsbs(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) * Other lockspace members may be going through the "neg" steps
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) * while also adding us to the lockspace, in which case they'll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) * be doing the recover_locks (RS_LOCKS) barrier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) dlm_set_recover_status(ls, DLM_RS_LOCKS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) error = dlm_recover_locks_wait(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) dlm_release_root_list(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) * Purge directory-related requests that are saved in requestqueue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) * All dir requests from before recovery are invalid now due to the dir
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) * rebuild and will be resent by the requesting nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) dlm_purge_requestqueue(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) dlm_set_recover_status(ls, DLM_RS_DONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) error = dlm_recover_done_wait(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) log_rinfo(ls, "dlm_recover_done_wait error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) dlm_clear_members_gone(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) dlm_adjust_timeouts(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) dlm_callback_resume(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) error = enable_locking(ls, rv->seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) log_rinfo(ls, "enable_locking error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) error = dlm_process_requestqueue(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) log_rinfo(ls, "dlm_process_requestqueue error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) error = dlm_recover_waiters_post(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) dlm_recover_grant(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) (unsigned long long)rv->seq, ls->ls_generation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) jiffies_to_msecs(jiffies - start));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) mutex_unlock(&ls->ls_recoverd_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) dlm_lsop_recover_done(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) fail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) dlm_release_root_list(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) log_rinfo(ls, "dlm_recover %llu error %d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) (unsigned long long)rv->seq, error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) mutex_unlock(&ls->ls_recoverd_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) /* The dlm_ls_start() that created the rv we take here may already have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) flag set. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) static void do_ls_recovery(struct dlm_ls *ls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) struct dlm_recover *rv = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) spin_lock(&ls->ls_recover_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) rv = ls->ls_recover_args;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) ls->ls_recover_args = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) if (rv && ls->ls_recover_seq == rv->seq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) spin_unlock(&ls->ls_recover_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) if (rv) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) ls_recover(ls, rv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) kfree(rv->nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) kfree(rv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) static int dlm_recoverd(void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) struct dlm_ls *ls;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) ls = dlm_find_lockspace_local(arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) if (!ls) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) log_print("dlm_recoverd: no lockspace %p", arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) down_write(&ls->ls_in_recovery);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) wake_up(&ls->ls_recover_lock_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) while (1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) * We call kthread_should_stop() after set_current_state().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) * This is because it works correctly if kthread_stop() is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) * called just before set_current_state().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) if (kthread_should_stop()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) set_current_state(TASK_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) if (kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) schedule();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) set_current_state(TASK_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) down_write(&ls->ls_in_recovery);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) wake_up(&ls->ls_recover_lock_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) do_ls_recovery(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) up_write(&ls->ls_in_recovery);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) dlm_put_lockspace(ls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) int dlm_recoverd_start(struct dlm_ls *ls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) int error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) if (IS_ERR(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) error = PTR_ERR(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) ls->ls_recoverd_task = p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) void dlm_recoverd_stop(struct dlm_ls *ls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) kthread_stop(ls->ls_recoverd_task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) void dlm_recoverd_suspend(struct dlm_ls *ls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) wake_up(&ls->ls_wait_general);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) mutex_lock(&ls->ls_recoverd_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) void dlm_recoverd_resume(struct dlm_ls *ls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) mutex_unlock(&ls->ls_recoverd_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352)