/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_JOURNAL_H
#define _BCACHE_JOURNAL_H

/*
 * THE JOURNAL:
 *
 * The journal is treated as a circular buffer of buckets - a journal entry
 * never spans two buckets. This means we could resize the journal at runtime
 * (not implemented yet), which will be needed for bcache on raw flash support.
 *
 * Journal entries contain a list of keys, ordered by the time they were
 * inserted; thus journal replay just has to reinsert the keys.
 *
 * We also keep some things in the journal header that are logically part of the
 * superblock - all the things that are frequently updated. This is for future
 * bcache on raw flash support; the superblock (which will become another
 * journal) can't be moved or wear leveled, so it contains just enough
 * information to find the main journal, and the superblock only has to be
 * rewritten when we want to move/wear level the main journal.
 *
 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
 * from cache misses, which don't have to be journaled, and for writeback and
 * moving gc we work around it by flushing the btree to disk before updating the
 * gc information. But it is a potential issue with incremental garbage
 * collection, and it's fragile.
 *
 * OPEN JOURNAL ENTRIES:
 *
 * Each journal entry contains, in the header, the sequence number of the last
 * journal entry still open - i.e. that has keys that haven't been flushed to
 * disk in the btree.
 *
 * We track this by maintaining a refcount for every open journal entry, in a
 * fifo; each entry in the fifo corresponds to a particular journal
 * entry/sequence number. When the refcount at the tail of the fifo goes to
 * zero, we pop it off - thus, the size of the fifo tells us the number of open
 * journal entries.
 *
 * We take a refcount on a journal entry when we add some keys to a journal
 * entry that we're going to insert (held by struct btree_op), and then when we
 * insert those keys into the btree the btree write we're setting up takes a
 * copy of that refcount (held by struct btree_write). That refcount is dropped
 * when the btree write completes.
 *
 * A struct btree_write can only hold a refcount on a single journal entry, but
 * might contain keys for many journal entries - we handle this by making sure
 * it always has a refcount on the _oldest_ journal entry of all the journal
 * entries it has keys for.
 *
 * JOURNAL RECLAIM:
 *
 * As mentioned previously, our fifo of refcounts tells us the number of open
 * journal entries; from that and the current journal sequence number we compute
 * last_seq - the oldest journal entry we still need (see the sketch below this
 * comment). We write last_seq in each journal entry, and we also have to keep
 * track of where it exists on disk so we don't overwrite it when we loop around
 * the journal.
 *
 * To do that we track, for each journal bucket, the sequence number of the
 * newest journal entry it contains - if we don't need that journal entry we
 * don't need anything in that bucket anymore. From that we track the last
 * journal bucket we still need; all this is tracked in struct journal_device
 * and updated by journal_reclaim().
 *
 * JOURNAL FILLING UP:
 *
 * There are two ways the journal could fill up; either we could run out of
 * space to write to, or we could have too many open journal entries and run out
 * of room in the fifo of refcounts. Since those refcounts are decremented
 * without any locking we can't safely resize that fifo, so we handle both
 * cases the same way.
 *
 * If the journal fills up, we start flushing dirty btree nodes until we can
 * allocate space for a journal write again - preferentially flushing btree
 * nodes that are pinning the oldest journal entries first.
 */

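/*
 * Illustrative sketch, not part of the interface below: assuming the fifo
 * helpers from util.h (fifo_front(), fifo_pop(), fifo_used()), the
 * relationship between the pin fifo and last_seq described above comes
 * down to roughly this (the real logic lives in journal.c):
 *
 *	atomic_t p;
 *
 *	// Drop fully flushed entries (refcount == 0) from the pop end
 *	while (!atomic_read(&fifo_front(&j->pin)))
 *		fifo_pop(&j->pin, p);
 *
 *	// Oldest journal entry we still need to keep around on disk
 *	last_seq = j->seq - fifo_used(&j->pin) + 1;
 */
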
/*
 * Only used for holding the journal entries we read in bch_journal_read()
 * during cache registration (see the replay sketch below the struct).
 */
struct journal_replay {
	struct list_head list;
	atomic_t *pin;
	struct jset j;
};

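/*
 * Rough usage sketch (illustration only; reinsert_key() is a hypothetical
 * stand-in for the actual btree insert path): bch_journal_read() fills a
 * list of these, and bch_journal_replay() walks that list in order,
 * reinserting every key it finds:
 *
 *	struct journal_replay *i;
 *	struct bkey *k;
 *
 *	list_for_each_entry(i, list, list)
 *		for (k = i->j.start;
 *		     k < bset_bkey_last(&i->j);
 *		     k = bkey_next(k))
 *			reinsert_key(c, k, i->pin);
 */
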
/*
 * We put two of these in struct journal; we use them for writes to the
 * journal that are being staged or in flight (see the sketch after
 * struct journal below).
 */
struct journal_write {
	struct jset *data;
#define JSET_BITS 3

	struct cache_set *c;
	struct closure_waitlist wait;
	bool dirty;
	bool need_write;
};

/* Embedded in struct cache_set */
struct journal {
	spinlock_t lock;
	spinlock_t flush_write_lock;
	bool btree_flushing;
	/* used when waiting because the journal was full */
	struct closure_waitlist wait;
	struct closure io;
	int io_in_flight;
	struct delayed_work work;

	/* Number of blocks free in the bucket(s) we're currently writing to */
	unsigned int blocks_free;
	uint64_t seq;
	DECLARE_FIFO(atomic_t, pin);

	BKEY_PADDED(key);

	struct journal_write w[2], *cur;
};

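/*
 * Sketch of the double buffering above (simplified; the real switch-over
 * is bch_journal_next(), which also resets the new buffer and bumps
 * j->seq): one journal_write accumulates incoming keys while the other
 * may be in flight to disk, and the write path flips cur:
 *
 *	j->cur = (j->cur == j->w) ? &j->w[1] : &j->w[0];
 */
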
/*
 * Embedded in struct cache. First three fields refer to the array of journal
 * buckets, in cache_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be
	 * reused (see the reclaim sketch below the struct).
	 */
	uint64_t seq[SB_JOURNAL_BUCKETS];

	/* Journal bucket we're currently writing to */
	unsigned int cur_idx;

	/* Last journal bucket that still contains an open journal entry */
	unsigned int last_idx;

	/* Next journal bucket to be discarded */
	unsigned int discard_idx;

#define DISCARD_READY 0
#define DISCARD_IN_FLIGHT 1
#define DISCARD_DONE 2
	/* One of DISCARD_READY, DISCARD_IN_FLIGHT or DISCARD_DONE */
	atomic_t discard_in_flight;

	struct work_struct discard_work;
	struct bio discard_bio;
	struct bio_vec discard_bv;

	/* Bio for journal reads/writes to this device */
	struct bio bio;
	struct bio_vec bv[8];
};

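/*
 * Illustrative sketch (not part of the interface): given last_seq as
 * derived from the pin fifo, journal_reclaim() advances last_idx past
 * buckets whose newest entry is no longer needed, roughly (ca being the
 * owning struct cache):
 *
 *	while (ja->last_idx != ja->cur_idx &&
 *	       ja->seq[ja->last_idx] < last_seq)
 *		ja->last_idx = (ja->last_idx + 1) %
 *			ca->sb.njournal_buckets;
 *
 * Buckets between discard_idx and last_idx are then eligible to be
 * discarded, one at a time, with discard_in_flight tracking progress.
 */
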
#define BTREE_FLUSH_NR 8

#define journal_pin_cmp(c, l, r) \
	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))

#define JOURNAL_PIN 20000

#define journal_full(j) \
	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)

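/*
 * Typical use of journal_full(), sketched from the wait path in journal.c
 * with locking and retry details omitted (journal_reclaim() and
 * btree_flush_write() are internal to journal.c): when the journal is
 * full, reclaim and flush dirty btree nodes, then wait for space:
 *
 *	if (journal_full(&c->journal)) {
 *		journal_reclaim(c);
 *		btree_flush_write(c);
 *		closure_wait(&c->journal.wait, cl);
 *	}
 */
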
struct closure;
struct cache_set;
struct btree_op;
struct keylist;

atomic_t *bch_journal(struct cache_set *c,
		      struct keylist *keys,
		      struct closure *parent);
void bch_journal_next(struct journal *j);
void bch_journal_mark(struct cache_set *c, struct list_head *list);
void bch_journal_meta(struct cache_set *c, struct closure *cl);
int bch_journal_read(struct cache_set *c, struct list_head *list);
int bch_journal_replay(struct cache_set *c, struct list_head *list);

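/*
 * Rough caller-side sketch (simplified from the data insert path):
 * journal the keys, insert them into the btree passing the returned pin
 * along, then drop the pin; bch_journal() may return NULL when the cache
 * set isn't running in sync mode:
 *
 *	atomic_t *ref = bch_journal(c, keys, parent);
 *
 *	bch_btree_insert(c, keys, ref, NULL);
 *	if (ref)
 *		atomic_dec_bug(ref);
 */
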
void bch_journal_free(struct cache_set *c);
int bch_journal_alloc(struct cache_set *c);

#endif /* _BCACHE_JOURNAL_H */