aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2021-04-15 10:47:23 -0700
committerDarrick J. Wong <djwong@kernel.org>2021-07-19 17:57:58 -0700
commit674a3454cd9d70a84b90f6874236e08a186e35d2 (patch)
treec930ca3baa6092941fc6f0a73922a5226e6dcab1
parentde99bbbddcb678f69a442e3acc8bf1fc8e6f58e6 (diff)
downloadxfs-linux-674a3454cd9d70a84b90f6874236e08a186e35d2.tar.gz
xfs: allow queued AG intents to drain before scrubbing
Notice: this object is not reachable from any branch.
scrub-drain-intents_2021-07-19
Currently, online scrub isn't sufficiently careful about quiescing allocation groups before checking them. While scrub does take the AG header locks, it doesn't serialize against chains of AG update intents that are being processed concurrently. If there's a collision, cross-referencing between data structures (e.g. rmapbt and refcountbt) can yield false corruption events; if repair is running, this results in incorrect repairs. Fix this by adding a count of active intents to the perag structure and making scrub wait until there are none before continuing. This is a little stupid since transactions can queue intents without taking buffer locks, but we'll also wait for those transactions. XXX: Should we instead have a per-AG rwsem that gets taken as soon as the AG[IF] are locked and stays held until the transaction commits or moves on to the next AG? Would we rather have a six lock so that intents can take an ix lock, and not have to upgrade to x until we actually want to make changes to that AG? Is that how those even work? Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Notice: this object is not reachable from any branch.
-rw-r--r--fs/xfs/libxfs/xfs_ag.c2
-rw-r--r--fs/xfs/libxfs/xfs_ag.h9
-rw-r--r--fs/xfs/libxfs/xfs_defer.c9
-rw-r--r--fs/xfs/libxfs/xfs_defer.h3
-rw-r--r--fs/xfs/scrub/bmap_repair.c4
-rw-r--r--fs/xfs/scrub/common.c147
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/inode_repair.c4
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/trace.h31
-rw-r--r--fs/xfs/xfs_bmap_item.c33
-rw-r--r--fs/xfs/xfs_extfree_item.c22
-rw-r--r--fs/xfs/xfs_mount.c75
-rw-r--r--fs/xfs/xfs_mount.h55
-rw-r--r--fs/xfs/xfs_refcount_item.c18
-rw-r--r--fs/xfs/xfs_rmap_item.c18
-rw-r--r--fs/xfs/xfs_super.c7
-rw-r--r--fs/xfs/xfs_swapext_item.c10
-rw-r--r--fs/xfs/xfs_trace.h105
19 files changed, 535 insertions, 22 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 644d7116b368f..817a015fe0b90 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -195,6 +195,7 @@ xfs_free_perag(
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
ASSERT(pag->pag_ici_needs_inactive == 0);
+ ASSERT(atomic_read(&pag->pag_intents) == 0);
unregister_shrinker(&pag->pag_inodegc_shrink);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
@@ -259,6 +260,7 @@ xfs_initialize_perag(
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_DELAYED_WORK(&pag->pag_inodegc_work, xfs_inodegc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pag_intents_wq);
init_waitqueue_head(&pag->pagb_wait);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index acd1f3017372f..c01cd8f5f61e4 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -114,6 +114,15 @@ struct xfs_perag {
* or have some other means to control concurrency.
*/
struct rhashtable pagi_unlinked_hash;
+
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t pag_intents;
+ wait_queue_head_t pag_intents_wq;
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index dc6fbfc2cadd7..3b3e47f8ca07c 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -340,7 +340,8 @@ xfs_defer_cancel_list(
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
- ops->cancel_item(pwi);
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ ops->cancel_item(mp, pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_free(dfp);
@@ -419,6 +420,7 @@ xfs_defer_finish_one(
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
+ trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
/*
@@ -562,7 +564,7 @@ xfs_defer_add(
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops;
+ const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -575,7 +577,6 @@ xfs_defer_add(
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- ops = defer_op_types[dfp->dfp_type];
if (dfp->dfp_type != type ||
(ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
@@ -593,6 +594,8 @@ xfs_defer_add(
}
list_add_tail(li, &dfp->dfp_work);
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
+ ops->add_item(tp->t_mountp, li);
dfp->dfp_count++;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 99ff9feb0d9b6..b6208378ce346 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,7 +55,8 @@ struct xfs_defer_op_type {
struct list_head *item, struct xfs_btree_cur **state);
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
- void (*cancel_item)(struct list_head *item);
+ void (*cancel_item)(struct xfs_mount *mp, struct list_head *item);
+ void (*add_item)(struct xfs_mount *mp, const struct list_head *item);
unsigned int max_items;
};
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 9b49c3a59e218..6037a69338570 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -332,7 +332,9 @@ xrep_bmap_scan_rt(
if (xrep_is_rtmeta_ino(sc, sc->ip->i_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
xchk_rt_btcur_free(&sc->sr);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index f381d05487a21..7bef5a4aecb32 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -486,8 +486,8 @@ want_ag_read_header_failure(
* all the buffers we grab to the scrub transaction so they'll all be freed
* when we cancel it. Returns ENOENT if we can't grab the perag structure.
*/
-int
-xchk_ag_read_headers(
+static inline int
+__xchk_ag_read_headers(
struct xfs_scrub *sc,
xfs_agnumber_t agno,
struct xchk_ag *sa)
@@ -495,13 +495,6 @@ xchk_ag_read_headers(
struct xfs_mount *mp = sc->mp;
int error;
- ASSERT(!sa->pag);
- sa->pag = xfs_perag_get(mp, agno);
- if (!sa->pag)
- return -ENOENT;
-
- sa->agno = agno;
-
error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -517,6 +510,89 @@ xchk_ag_read_headers(
return 0;
}
+/*
+ * Sample the AG's live intent counter, emit the xchk_ag_read_headers
+ * tracepoint with the sampled value, and report whether any intents are
+ * still pending (counter > 0).
+ */
+static inline bool
+xchk_ag_intents_pending(
+ struct xfs_perag *pag)
+{
+ int intents = atomic_read(&pag->pag_intents);
+
+ trace_xchk_ag_read_headers(pag->pag_mount, pag->pag_agno, intents,
+ _RET_IP_);
+
+ return intents > 0;
+}
+
+/*
+ * Grab all the headers for an AG, and wait until there aren't any pending
+ * intents.
+ *
+ * Takes a perag reference in sa->pag, then loops: read the AG headers,
+ * sample the intent counter, and if intents are pending, release the AGFL,
+ * AGF and AGI buffers and sleep until the counter drains before retrying.
+ * Returns 0 once the headers have been read and no intents were pending,
+ * -ENOENT if the perag cannot be grabbed, or the error from reading the
+ * headers or from the (killable) wait.
+ */
+int
+xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
+
+ sa->agno = agno;
+
+ do {
+ error = __xchk_ag_read_headers(sc, agno, sa);
+ if (error)
+ break;
+
+ /*
+ * Decide if this AG is quiet enough for all metadata to be
+ * consistent with each other. XFS allows the AG header buffer
+ * locks to cycle across transaction rolls while processing
+ * chains of deferred ops, which means that there could be
+ * other threads in the middle of processing a chain of
+ * deferred ops. For regular operations we are careful about
+ * ordering operations to prevent collisions between threads
+ * (which is why we don't need a per-AG lock), but scrub and
+ * repair have to serialize against chained operations.
+ *
+ * We just locked all the AG headers buffers; now take a look
+ * to see if there are any intents in progress. If there are,
+ * drop the AG headers and wait for the intents to drain.
+ * Since we hold all the AG header locks for the duration of
+ * the scrub, this is the only time we have to sample the
+ * intents counter; any threads increasing it after this point
+ * can't possibly be in the middle of a chain of AG metadata
+ * updates.
+ */
+ if (!xchk_ag_intents_pending(sa->pag)) {
+ error = 0;
+ break;
+ }
+
+ /* Release the buffers in the reverse of the order they were read. */
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+
+ /* Sleep until the counter hits zero; a nonzero return ends the loop. */
+ error = xfs_perag_wait_intents(sa->pag);
+ } while (!error);
+
+ return error;
+}
+
/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
@@ -644,14 +720,59 @@ xchk_ag_init(
return 0;
}
-/* Lock everything we need to work on realtime metadata. */
-void
+#if IS_ENABLED(CONFIG_XFS_RT)
+/*
+ * Sample the realtime intent counter, emit the xchk_rt_lock tracepoint
+ * (the agno argument is passed as -1U), and report whether any rt intents
+ * are still pending (counter > 0).
+ */
+static inline bool
+xchk_rt_intents_pending(
+ struct xfs_mount *mp)
+{
+ int intents = atomic_read(&mp->m_rt_intents);
+
+ trace_xchk_rt_lock(mp, -1U, intents, _RET_IP_);
+
+ return intents > 0;
+}
+#else
+/* Without CONFIG_XFS_RT this always evaluates to false. */
+# define xchk_rt_intents_pending(mp) (false)
+#endif
+
+/*
+ * Lock everything we need to work on realtime metadata and wait for intents.
+ * Returns 0 with the rt locks held and sr->locked set, or the error from the
+ * (killable) wait for rt intents, in which case the locks have been dropped.
+ */
+int
xchk_rt_lock(
struct xfs_scrub *sc,
struct xchk_rt *sr)
{
- xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
- sr->locked = true;
+ int error;
+
+ do {
+ xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
+
+ /*
+ * Decide if the RT volume is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the rt inodes; now take a look to see if
+ * there are any rt intents in progress. If there are, drop
+ * the rt inode locks and wait for the intents to drain. Since
+ * we hold the rt inode locks for the duration of the scrub,
+ * this is the only time we have to sample the intents counter;
+ * any threads increasing it after this point can't possibly be
+ * in the middle of a chain of rt metadata updates.
+ */
+ if (!xchk_rt_intents_pending(sc->mp)) {
+ sr->locked = true;
+ error = 0;
+ break;
+ }
+
+ xfs_rtunlock(sc->mp, XFS_RTLOCK_ALL);
+
+ error = xfs_rt_wait_intents(sc->mp);
+ } while (!error);
+
+ return error;
}
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index dbd4145690b2e..70a42897cd22e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -153,7 +153,7 @@ xchk_ag_init_existing(
void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_btcur_free(struct xchk_rt *sr);
-void xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 31ecdb2fd32f1..f17f31fd3eb6f 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -597,7 +597,9 @@ xrep_dinode_count_rt_rmaps(
xrep_is_rtmeta_ino(sc, sc->sm->sm_ino))
return 0;
- xchk_rt_lock(sc, &sc->sr);
+ error = xchk_rt_lock(sc, &sc->sr);
+ if (error)
+ return error;
xrep_rt_btcur_init(sc, &sc->sr);
error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
dis);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index ba639854a5cb5..c0730b73ebdad 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -483,6 +483,7 @@ xrep_newbt_schedule_reap(
INIT_LIST_HEAD(&efi_item.xefi_list);
list_add(&efi_item.xefi_list, &items);
+ xfs_fs_bump_intents(xnr->sc->mp, false, resv->fsbno);
resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
&items, 1, false);
}
@@ -713,6 +714,7 @@ xrep_newbt_destroy(
goto junkit;
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
@@ -725,6 +727,7 @@ junkit:
list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
xfs_extent_free_defer_type.abort_intent(resv->efi);
list_del(&resv->list);
+ xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
kmem_free(resv);
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index adf78aee917ea..2dd3641efdfbe 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -704,6 +704,37 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+/*
+ * Event class for scrub locking an AG (or the rt volume) and sampling the
+ * number of pending log intent items.  Both users pass the sampled intent
+ * counter, so the field is named and printed as "intents".
+ */
+DECLARE_EVENT_CLASS(xchk_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int intents,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, intents, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, intents)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->intents = intents;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u intents %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->intents,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_XCHK_AG_EVENT(name) \
+DEFINE_EVENT(xchk_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int intents, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, intents, caller_ip))
+DEFINE_XCHK_AG_EVENT(xchk_ag_read_headers);
+DEFINE_XCHK_AG_EVENT(xchk_rt_lock);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 687527dde62ce..354f78ffab63e 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -370,6 +370,7 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_bmap_intent *bmap;
xfs_filblks_t count;
int error;
@@ -388,6 +389,10 @@ xfs_bmap_update_finish_item(
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
+
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ xfs_fs_drop_intents(mp, XFS_IS_REALTIME_INODE(bmap->bi_owner),
+ bmap->bi_bmap.br_startblock);
kmem_free(bmap);
return error;
}
@@ -400,17 +405,42 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred rmap update. */
+/* Cancel a deferred bmap update. */
STATIC void
xfs_bmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_bmap_intent *bmap;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ xfs_fs_drop_intents(mp, XFS_IS_REALTIME_INODE(bmap->bi_owner),
+ bmap->bi_bmap.br_startblock);
kmem_free(bmap);
}
+/*
+ * Add a deferred bmap update.  Called from xfs_defer_add() when the work
+ * item is queued; the matching drop happens when the item is finished or
+ * cancelled.
+ */
+STATIC void
+xfs_bmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_bmap_intent *bmap;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ /*
+ * Decide if it's necessary to bump the live intent counter on behalf
+ * of the deferred rmap intent item we will queue when we finish this
+ * bmap work.
+ */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ xfs_fs_bump_intents(mp, XFS_IS_REALTIME_INODE(bmap->bi_owner),
+ bmap->bi_bmap.br_startblock);
+}
+
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
.create_intent = xfs_bmap_update_create_intent,
@@ -418,6 +448,7 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
+ .add_item = xfs_bmap_update_add_item,
};
/* Is this recovered BUI ok? */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 5be45d00a32dc..7b83680dd5476 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -490,6 +490,7 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *free;
int error;
@@ -500,7 +501,7 @@ xfs_extent_free_finish_item(
* haven't locked the rt inodes yet.
*/
if (*state == NULL && free->xefi_realtime) {
- xfs_rtlock(tp, tp->t_mountp, XFS_RTLOCK_ALLOC);
+ xfs_rtlock(tp, mp, XFS_RTLOCK_ALLOC);
*state = (struct xfs_btree_cur *)1;
}
@@ -508,6 +509,8 @@ xfs_extent_free_finish_item(
free->xefi_startblock,
free->xefi_blockcount, free->xefi_realtime,
&free->xefi_oinfo, free->xefi_skip_discard);
+
+ xfs_fs_drop_intents(mp, free->xefi_realtime, free->xefi_startblock);
kmem_free(free);
return error;
}
@@ -523,14 +526,28 @@ xfs_extent_free_abort_intent(
/* Cancel a free extent. */
STATIC void
xfs_extent_free_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ xfs_fs_drop_intents(mp, free->xefi_realtime, free->xefi_startblock);
kmem_free(free);
}
+/*
+ * Add a deferred free extent.  Bumps the live intent counter of the rt
+ * volume (if xefi_realtime is set) or of the AG containing the extent's
+ * start block.
+ */
+STATIC void
+xfs_extent_free_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_extent_free_item *free;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ xfs_fs_bump_intents(mp, free->xefi_realtime, free->xefi_startblock);
+}
+
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
.create_intent = xfs_extent_free_create_intent,
@@ -538,6 +555,7 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/*
@@ -591,6 +609,7 @@ xfs_agfl_free_finish_item(
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
+ xfs_fs_drop_intents(mp, free->xefi_realtime, free->xefi_startblock);
kmem_free(free);
return error;
}
@@ -603,6 +622,7 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
+ .add_item = xfs_extent_free_add_item,
};
/* Is this recovered EFI ok? */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c68c7ae5c12ae..a6e1383197914 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,3 +1468,78 @@ xfs_hook_call(
{
return srcu_notifier_call_chain(&chain->head, val, priv);
}
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+
+#if IS_ENABLED(CONFIG_XFS_RT)
+/*
+ * Bump the realtime volume's live intent counter.  The tracepoint fires
+ * before the increment.
+ */
+void
+xfs_rt_bump_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_bump_intents(mp, __return_address);
+
+ atomic_inc(&mp->m_rt_intents);
+}
+
+/*
+ * Drop one live realtime intent.  When the counter reaches zero, wake up
+ * anyone sleeping on m_rt_intents_wq (see xfs_rt_wait_intents).
+ */
+void
+xfs_rt_drop_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_drop_intents(mp, __return_address);
+
+ /* A drop without a prior bump would underflow the counter. */
+ ASSERT(atomic_read(&mp->m_rt_intents) > 0);
+
+ if (atomic_dec_and_test(&mp->m_rt_intents))
+ wake_up(&mp->m_rt_intents_wq);
+}
+
+/*
+ * Sleep (killably) until the realtime intent counter reaches zero.
+ * Returns the result of wait_event_killable().
+ */
+int
+xfs_rt_wait_intents(
+ struct xfs_mount *mp)
+{
+ trace_xfs_rt_wait_intents(mp, __return_address);
+
+ return wait_event_killable(mp->m_rt_intents_wq,
+ atomic_read(&mp->m_rt_intents) == 0);
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Bump the live intent counter of the given AG.  A perag reference is
+ * taken only for the duration of the update.
+ *
+ * NOTE(review): the xfs_perag_get() result is dereferenced without a NULL
+ * check; presumably callers always pass a valid agno -- confirm.
+ */
+void
+xfs_ag_bump_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_bump_intents(pag, __return_address);
+
+ atomic_inc(&pag->pag_intents);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Drop one live intent from the given AG.  When the counter reaches zero,
+ * wake up anyone sleeping on pag_intents_wq (see xfs_perag_wait_intents).
+ *
+ * NOTE(review): the xfs_perag_get() result is dereferenced without a NULL
+ * check; presumably callers always pass a valid agno -- confirm.
+ */
+void
+xfs_ag_drop_intents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ trace_xfs_perag_drop_intents(pag, __return_address);
+
+ /* A drop without a prior bump would underflow the counter. */
+ ASSERT(atomic_read(&pag->pag_intents) > 0);
+
+ if (atomic_dec_and_test(&pag->pag_intents))
+ wake_up(&pag->pag_intents_wq);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Sleep (killably) until this AG's intent counter reaches zero.  Returns
+ * the result of wait_event_killable().
+ */
+int
+xfs_perag_wait_intents(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_wait_intents(pag, __return_address);
+
+ return wait_event_killable(pag->pag_intents_wq,
+ atomic_read(&pag->pag_intents) == 0);
+}
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ca3e17d4f93c6..a096c7b81b3c9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -253,6 +253,17 @@ typedef struct xfs_mount {
/* online nlink check stuff */
struct xfs_hook_chain m_nlink_mod_hooks;
#endif
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ /*
+ * Counter of live intents. We track the number of log intent items
+ * that have been queued (but not yet processed) so that scrub can
+ * detect the presence of other threads that are in the middle of
+ * processing a chain of deferred items.
+ */
+ atomic_t m_rt_intents;
+ wait_queue_head_t m_rt_intents_wq;
+#endif
} xfs_mount_t;
/* Parameters for xfs_bumplink/droplink hook. */
@@ -394,5 +405,49 @@ int xfs_hook_add(struct xfs_hook_chain *chain, struct notifier_block *hook,
notifier_fn_t fn);
void xfs_hook_del(struct xfs_hook_chain *chain, struct notifier_block *hook);
int xfs_hook_call(struct xfs_hook_chain *chain, unsigned long val, void *priv);
+
+/*
+ * Live intent accounting for online scrub.
+ *
+ * The realtime helpers are declared (or stubbed out) first so that the
+ * xfs_fs_{bump,drop}_intents() inline wrappers below always see a
+ * definition of xfs_rt_{bump,drop}_intents(), including when scrub is
+ * enabled but realtime support is not.  The void stubs are inline
+ * functions rather than macros so that their arguments are still
+ * type-checked and count as used when the feature is compiled out.
+ */
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+void xfs_rt_bump_intents(struct xfs_mount *mp);
+void xfs_rt_drop_intents(struct xfs_mount *mp);
+int xfs_rt_wait_intents(struct xfs_mount *mp);
+#else
+static inline void xfs_rt_bump_intents(struct xfs_mount *mp) { }
+static inline void xfs_rt_drop_intents(struct xfs_mount *mp) { }
+# define xfs_rt_wait_intents(mp) (-ENOSYS)
+#endif
+
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+void xfs_ag_bump_intents(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_ag_drop_intents(struct xfs_mount *mp, xfs_agnumber_t agno);
+int xfs_perag_wait_intents(struct xfs_perag *pag);
+
+/* Bump the intent counter of the rt volume or of the AG containing @fsb. */
+static inline void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_bump_intents(mp);
+ else
+ xfs_ag_bump_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+/* Drop the intent counter of the rt volume or of the AG containing @fsb. */
+static inline void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb)
+{
+ if (isrt)
+ xfs_rt_drop_intents(mp);
+ else
+ xfs_ag_drop_intents(mp, XFS_FSB_TO_AGNO(mp, fsb));
+}
+
+#else
+static inline void
+xfs_ag_bump_intents(struct xfs_mount *mp, xfs_agnumber_t agno) { }
+static inline void
+xfs_ag_drop_intents(struct xfs_mount *mp, xfs_agnumber_t agno) { }
+# define xfs_perag_wait_intents(pag) (-ENOSYS)
+static inline void
+xfs_fs_bump_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+static inline void
+xfs_fs_drop_intents(struct xfs_mount *mp, bool isrt, xfs_fsblock_t fsb) { }
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 3cacfd2fc95e7..afab964f6456c 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -379,6 +379,7 @@ xfs_refcount_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_refcount_intent *refc;
xfs_fsblock_t new_fsb;
xfs_filblks_t new_aglen;
@@ -397,6 +398,8 @@ xfs_refcount_update_finish_item(
refc->ri_blockcount = new_aglen;
return -EAGAIN;
}
+
+ xfs_fs_drop_intents(mp, refc->ri_realtime, refc->ri_startblock);
kmem_free(refc);
return error;
}
@@ -412,14 +415,28 @@ xfs_refcount_update_abort_intent(
/* Cancel a deferred refcount update. */
STATIC void
xfs_refcount_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_refcount_intent *refc;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_drop_intents(mp, refc->ri_realtime, refc->ri_startblock);
kmem_free(refc);
}
+/*
+ * Add a deferred refcount update.  Bumps the live intent counter of the rt
+ * volume (if ri_realtime is set) or of the AG containing ri_startblock.
+ */
+STATIC void
+xfs_refcount_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_refcount_intent *refc;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ xfs_fs_bump_intents(mp, refc->ri_realtime, refc->ri_startblock);
+}
+
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
.create_intent = xfs_refcount_update_create_intent,
@@ -428,6 +445,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.finish_item = xfs_refcount_update_finish_item,
.finish_cleanup = xfs_refcount_finish_one_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
+ .add_item = xfs_refcount_update_add_item,
};
/* Is this recovered CUI ok? */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 78931726242bd..6b3d10de134e0 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -430,6 +430,7 @@ xfs_rmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_rmap_intent *rmap;
int error;
@@ -439,6 +440,8 @@ xfs_rmap_update_finish_item(
rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
rmap->ri_realtime, state);
+
+ xfs_fs_drop_intents(mp, rmap->ri_realtime, rmap->ri_bmap.br_startblock);
kmem_free(rmap);
return error;
}
@@ -454,14 +457,28 @@ xfs_rmap_update_abort_intent(
/* Cancel a deferred rmap update. */
STATIC void
xfs_rmap_update_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_rmap_intent *rmap;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ xfs_fs_drop_intents(mp, rmap->ri_realtime, rmap->ri_bmap.br_startblock);
kmem_free(rmap);
}
+/*
+ * Add a deferred rmap update.  Bumps the live intent counter of the rt
+ * volume (if ri_realtime is set) or of the AG containing the mapping's
+ * start block.
+ */
+STATIC void
+xfs_rmap_update_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+ const struct xfs_rmap_intent *rmap;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ xfs_fs_bump_intents(mp, rmap->ri_realtime, rmap->ri_bmap.br_startblock);
+}
+
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
.create_intent = xfs_rmap_update_create_intent,
@@ -470,6 +487,7 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.finish_item = xfs_rmap_update_finish_item,
.finish_cleanup = xfs_rmap_finish_one_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
+ .add_item = xfs_rmap_update_add_item,
};
/* Is this recovered RUI ok? */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb31dc4cb3d5b..807be2305a4a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -704,6 +704,9 @@ xfs_mount_free(
ASSERT(!mutex_is_locked(&mp->m_scrub_freeze));
mutex_destroy(&mp->m_scrub_freeze);
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ ASSERT(atomic_read(&mp->m_rt_intents) == 0);
+#endif
kmem_free(mp);
}
@@ -1927,6 +1930,10 @@ static int xfs_init_fs_context(
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ init_waitqueue_head(&mp->m_rt_intents_wq);
+ atomic_set(&mp->m_rt_intents, 0);
+#endif
/*
* We don't create the finobt per-ag space reservation until after log
* recovery, so we must set this to true so that an ifree transaction
diff --git a/fs/xfs/xfs_swapext_item.c b/fs/xfs/xfs_swapext_item.c
index 60aba7f30b641..96b6ed4c24ce2 100644
--- a/fs/xfs/xfs_swapext_item.c
+++ b/fs/xfs/xfs_swapext_item.c
@@ -346,6 +346,7 @@ xfs_swapext_abort_intent(
/* Cancel a deferred swapext update. */
STATIC void
xfs_swapext_cancel_item(
+ struct xfs_mount *mp,
struct list_head *item)
{
struct xfs_swapext_intent *sxi;
@@ -354,6 +355,14 @@ xfs_swapext_cancel_item(
kmem_free(sxi);
}
+/*
+ * Add a deferred swapext update.  This hook adjusts no intent counter; it
+ * exists because xfs_defer_add() calls ->add_item without checking for
+ * NULL.
+ */
+STATIC void
+xfs_swapext_add_item(
+ struct xfs_mount *mp,
+ const struct list_head *item)
+{
+}
+
const struct xfs_defer_op_type xfs_swapext_defer_type = {
.max_items = XFS_SXI_MAX_FAST_EXTENTS,
.create_intent = xfs_swapext_create_intent,
@@ -361,6 +370,7 @@ const struct xfs_defer_op_type xfs_swapext_defer_type = {
.create_done = xfs_swapext_create_done,
.finish_item = xfs_swapext_finish_item,
.cancel_item = xfs_swapext_cancel_item,
+ .add_item = xfs_swapext_add_item,
};
/* Is this recovered SXI ok? */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 985b0e954b020..60c9c81dc1816 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -197,6 +197,8 @@ DEFINE_FS_EVENT(xfs_inodegc_delay_mempressure);
DEFINE_FS_EVENT(xfs_blockgc_start);
DEFINE_FS_EVENT(xfs_blockgc_stop);
DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+DEFINE_FS_EVENT(xfs_force_shutdown1);
+DEFINE_FS_EVENT(xfs_force_shutdown2);
TRACE_EVENT(xfs_inodegc_requeue_mempressure,
TP_PROTO(struct xfs_perag *pag, unsigned long nr, void *caller_ip),
@@ -2864,6 +2866,44 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+/*
+ * Event class for a single work item being added to, cancelled from, or
+ * finished on a deferred-ops pending structure.  Records the pending op
+ * type, its intent pointer, the work item pointer, whether a done item has
+ * been attached (dfp_done != NULL), and the current item count.  A NULL
+ * mount is tolerated and reported as device 0:0.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
+ void *item),
+ TP_ARGS(mp, dfp, item),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(void *, item)
+ __field(char, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->item = item;
+ __entry->committed = dfp->dfp_done != NULL;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->item,
+ __entry->committed,
+ __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_item_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \
+ void *item), \
+ TP_ARGS(mp, dfp, item))
+
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
+
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
@@ -4811,6 +4851,71 @@ TRACE_EVENT(xfs_force_shutdown,
__entry->line_num)
);
+/*
+ * Event class for per-AG intent counter activity (bump, drop, wait).  The
+ * counter is sampled when the tracepoint fires; nr_intents is reported as
+ * -1 when online scrub is not configured.
+ */
+DECLARE_EVENT_CLASS(xfs_perag_intents_class,
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip),
+ TP_ARGS(pag, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB)
+ __entry->nr_intents = atomic_read(&pag->pag_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_PERAG_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_perag_intents_class, name, \
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
+ TP_ARGS(pag, caller_ip))
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_bump_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_drop_intents);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+
+/*
+ * Event class for realtime intent counter activity (bump, drop, wait).
+ * The counter is sampled when the tracepoint fires; nr_intents is reported
+ * as -1 unless both online scrub and realtime support are configured.
+ */
+DECLARE_EVENT_CLASS(xfs_rt_intents_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+#if IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) && IS_ENABLED(CONFIG_XFS_RT)
+ __entry->nr_intents = atomic_read(&mp->m_rt_intents);
+#else
+ __entry->nr_intents = -1;
+#endif
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_RT_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_rt_intents_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_RT_INTENTS_EVENT(xfs_rt_bump_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_drop_intents);
+DEFINE_RT_INTENTS_EVENT(xfs_rt_wait_intents);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH