author	Lars Ellenberg <lars.ellenberg@linbit.com>	2016-06-22 10:22:10 +0200
committer	Mike Snitzer <snitzer@redhat.com>	2016-07-07 10:18:06 -0400
commit	36cee4b1ddef0a46562045b421792a847c570b6b (patch)
tree	793ffd9772a874320d758b2bd767acbb9fe8fe26
parent	4a28ff8e09f81ed855dbc13e0bc977dac96f053c (diff)
download	linux-36cee4b1ddef0a46562045b421792a847c570b6b.tar.gz
block: fix blk_queue_split() resource exhaustion
For a long time, generic_make_request() has converted recursion into iteration by queuing recursive arguments on current->bio_list.

This is convenient for stacking drivers: the top-most driver takes the originally submitted bio and re-submits a re-mapped version of it, or one or more clones, or one or more newly allocated bios, to its backend(s). These are then simply processed in turn, and each can again queue more "backend-bios", until we reach the bottom of the driver stack and actually dispatch to the real backend device.

Any stacking driver's ->make_request_fn() could expect that, once it returns, any backend-bios it submitted via recursive calls to generic_make_request() would be processed and dispatched before the current task calls into this driver again.

This changed with commit 54efd50 ("block: make generic_make_request handle arbitrarily sized bios"). Drivers may now call blk_queue_split() inside their ->make_request_fn(), which may split the current bio into a front part, to be dealt with immediately, and a remainder part, which may need to be split even further. That remainder part is simply also pushed to current->bio_list, and ends up head-of-queue, in front of any backend-bios the current ->make_request_fn() might submit while processing the front part. As a result, the current task immediately ends up back in the same ->make_request_fn() of the same driver, before any of its backend-bios have even been processed. This can lead to resource-starvation deadlock.

Drivers could avoid this by learning not to need blk_queue_split(), by submitting their backend-bios in a different context (dedicated kernel thread, workqueue context, ...), or by playing funny re-ordering games with entries on current->bio_list.

Instead, I suggest distinguishing between recursive calls to generic_make_request() and pushing back the remainder part in blk_queue_split(), by pointing current->bio_lists to a

	struct recursion_to_iteration_bio_lists {
		struct bio_list recursion;
		struct bio_list remainder;
	}

To have all bios targeted at drivers lower in the stack processed before the next piece of a bio targeted at a higher level, queued bios resulting from recursion are processed in FIFO order for as long as any are available. Remainder parts pushed back by blk_queue_split() are processed in LIFO order, one by one, whenever the recursion list becomes empty.

Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Roland Kammerer <roland.kammerer@linbit.com>
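In code terms, that scheduling rule reduces to a single pop order. The following is a sketch only, using a hypothetical helper that is not part of the patch; it condenses the two bio_list_pop() calls visible in the blk-core.c hunk below:

	/* Hypothetical helper condensing the pop order from the blk-core.c
	 * hunk: drain recursive submissions FIFO first; only when none
	 * remain, take one pushed-back remainder part (LIFO). */
	static struct bio *pop_next_bio(struct recursion_to_iteration_bio_lists *bl)
	{
		struct bio *bio = bio_list_pop(&bl->recursion);

		if (!bio)
			bio = bio_list_pop(&bl->remainder);
		return bio;
	}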
-rw-r--r--	block/bio.c	14
-rw-r--r--	block/blk-core.c	32
-rw-r--r--	block/blk-merge.c	3
-rw-r--r--	drivers/md/bcache/btree.c	12
-rw-r--r--	drivers/md/dm-bufio.c	2
-rw-r--r--	drivers/md/md.h	7
-rw-r--r--	drivers/md/raid1.c	5
-rw-r--r--	drivers/md/raid10.c	5
-rw-r--r--	include/linux/bio.h	11
-rw-r--r--	include/linux/sched.h	4
10 files changed, 57 insertions, 38 deletions
diff --git a/block/bio.c b/block/bio.c
index 848cd351513ba..a2bfc7bf1e25b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -368,10 +368,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
bio_list_init(&punt);
bio_list_init(&nopunt);
- while ((bio = bio_list_pop(current->bio_list)))
+ while ((bio = bio_list_pop(&current->bio_lists->recursion)))
bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
- *current->bio_list = nopunt;
+ current->bio_lists->recursion = nopunt;
spin_lock(&bs->rescue_lock);
bio_list_merge(&bs->rescue_list, &punt);
@@ -453,13 +453,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
*
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
- * bios we would be blocking to the rescuer workqueue before
- * we retry with the original gfp_flags.
+ * bios on current->bio_lists->recursion, we first try the
+ * allocation without __GFP_DIRECT_RECLAIM; if that fails, we
+ * punt those bios we would be blocking to the rescuer
+ * workqueue before we retry with the original gfp_flags.
*/
- if (current->bio_list && !bio_list_empty(current->bio_list))
+ if (current->bio_lists && !bio_list_empty(&current->bio_lists->recursion))
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(bs->bio_pool, gfp_mask);
diff --git a/block/blk-core.c b/block/blk-core.c
index 3cfd67d006fb4..ecdff1e1a673c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2040,7 +2040,7 @@ end_io:
*/
blk_qc_t generic_make_request(struct bio *bio)
{
- struct bio_list bio_list_on_stack;
+ struct recursion_to_iteration_bio_lists bio_lists_on_stack;
blk_qc_t ret = BLK_QC_T_NONE;
if (!generic_make_request_checks(bio))
@@ -2049,15 +2049,18 @@ blk_qc_t generic_make_request(struct bio *bio)
/*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
+ * current->bio_lists to keep a list of requests submitted by a
+ * make_request_fn function. current->bio_lists is also used as a
* flag to say if generic_make_request is currently active in this
* task or not. If it is NULL, then no make_request is active. If
* it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
+ * should be added at the tail of current->bio_lists->recursion;
+ * remainder bios resulting from a call to blk_queue_split() from
+ * within ->make_request_fn() should be added to the head of
+ * current->bio_lists->remainder.
*/
- if (current->bio_list) {
- bio_list_add(current->bio_list, bio);
+ if (current->bio_lists) {
+ bio_list_add(&current->bio_lists->recursion, bio);
goto out;
}
@@ -2066,7 +2069,7 @@ blk_qc_t generic_make_request(struct bio *bio)
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
+ * we assign bio_list to a pointer to the bio_lists_on_stack,
* thus initialising the bio_list of new bios to be
* added. ->make_request() may indeed add some more bios
* through a recursive call to generic_make_request. If it
@@ -2076,8 +2079,9 @@ blk_qc_t generic_make_request(struct bio *bio)
* bio_list, and call into ->make_request() again.
*/
BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack);
- current->bio_list = &bio_list_on_stack;
+ bio_list_init(&bio_lists_on_stack.recursion);
+ bio_list_init(&bio_lists_on_stack.remainder);
+ current->bio_lists = &bio_lists_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -2085,16 +2089,14 @@ blk_qc_t generic_make_request(struct bio *bio)
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
-
- bio = bio_list_pop(current->bio_list);
} else {
- struct bio *bio_next = bio_list_pop(current->bio_list);
-
bio_io_error(bio);
- bio = bio_next;
}
+ bio = bio_list_pop(&current->bio_lists->recursion);
+ if (!bio)
+ bio = bio_list_pop(&current->bio_lists->remainder);
} while (bio);
- current->bio_list = NULL; /* deactivate */
+ current->bio_lists = NULL; /* deactivate */
out:
return ret;
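Assembled from the hunk above, the lifecycle of the on-stack lists is roughly the following (a sketch reconstructed from the diff, with the dispatch details of the loop body elided):

	struct recursion_to_iteration_bio_lists bio_lists_on_stack;

	bio_list_init(&bio_lists_on_stack.recursion);
	bio_list_init(&bio_lists_on_stack.remainder);
	current->bio_lists = &bio_lists_on_stack;	/* marks generic_make_request() active */
	do {
		/* ... dispatch bio to q->make_request_fn(), then pick the next one ... */
		bio = bio_list_pop(&current->bio_lists->recursion);
		if (!bio)
			bio = bio_list_pop(&current->bio_lists->remainder);
	} while (bio);
	current->bio_lists = NULL;			/* deactivate */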
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c265348b75d1a..1e1bbe6c6dafe 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -172,6 +172,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
struct bio *split, *res;
unsigned nsegs;
+ BUG_ON(!current->bio_lists);
if (bio_op(*bio) == REQ_OP_DISCARD)
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
else if (bio_op(*bio) == REQ_OP_WRITE_SAME)
@@ -190,7 +191,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
- generic_make_request(*bio);
+ bio_list_add_head(&current->bio_lists->remainder, *bio);
*bio = split;
}
}
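This hunk is the heart of the fix: instead of re-submitting the remainder through generic_make_request(), where it would land ahead of the driver's backend-bios, blk_queue_split() now parks it on the remainder list. Because bio_list_add_head() pushes to the head and bio_list_pop() takes from the head, nested splits drain newest-first, keeping processing depth-first per original bio. A hypothetical illustration (r1 and r2 are stand-ins for successive remainder parts, not names from the patch):

	bio_list_add_head(&current->bio_lists->remainder, r1);	/* list: r1 */
	bio_list_add_head(&current->bio_lists->remainder, r2);	/* list: r2, r1 */
	bio = bio_list_pop(&current->bio_lists->remainder);	/* yields r2, the newest remainder */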
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 76f7534d1dd15..731ec3b84df4a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -450,7 +450,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
trace_bcache_btree_write(b);
- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);
BUG_ON(b->written >= btree_blocks(b));
BUG_ON(b->written && !i->keys);
BUG_ON(btree_bset_first(b)->seq != i->seq);
@@ -544,7 +544,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
/* Force write if set is too big */
if (set_bytes(i) > PAGE_SIZE - 48 &&
- !current->bio_list)
+ !current->bio_lists)
bch_btree_node_write(b, NULL);
}
@@ -889,7 +889,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
{
struct btree *b;
- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);
lockdep_assert_held(&c->bucket_lock);
@@ -976,7 +976,7 @@ retry:
b = mca_find(c, k);
if (!b) {
- if (current->bio_list)
+ if (current->bio_lists)
return ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
@@ -2127,7 +2127,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
return 0;
split:
- if (current->bio_list) {
+ if (current->bio_lists) {
op->lock = b->c->root->level + 1;
return -EAGAIN;
} else if (op->lock <= b->c->root->level) {
@@ -2209,7 +2209,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
struct btree_insert_op op;
int ret = 0;
- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);
BUG_ON(bch_keylist_empty(keys));
bch_btree_op_init(&op.op, 0);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 6571c81465e18..ba0c325815a76 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -174,7 +174,7 @@ static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
-#define dm_bufio_in_request() (!!current->bio_list)
+#define dm_bufio_in_request() (!!current->bio_lists)
static void dm_bufio_lock(struct dm_bufio_client *c)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4f335245bd60..ed1661c2a956a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -664,6 +664,13 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
+static inline bool current_has_pending_bios(void)
+{
+ return current->bio_lists && (
+ !bio_list_empty(&current->bio_lists->recursion) ||
+ !bio_list_empty(&current->bio_lists->remainder));
+}
+
extern struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 10e53cd6a995a..38790e367bd30 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -876,8 +876,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
(!conf->barrier ||
((conf->start_next_window <
conf->next_resync + RESYNC_SECTORS) &&
- current->bio_list &&
- !bio_list_empty(current->bio_list))),
+ current_has_pending_bios())),
conf->resync_lock);
conf->nr_waiting--;
}
@@ -1014,7 +1013,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r1conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule || current->bio_list) {
+ if (from_schedule || current->bio_lists) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 245640b501539..13a534140f4f3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -945,8 +945,7 @@ static void wait_barrier(struct r10conf *conf)
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(conf->nr_pending &&
- current->bio_list &&
- !bio_list_empty(current->bio_list)),
+ current_has_pending_bios()),
conf->resync_lock);
conf->nr_waiting--;
}
@@ -1022,7 +1021,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r10conf *conf = mddev->private;
struct bio *bio;
- if (from_schedule || current->bio_list) {
+ if (from_schedule || current->bio_lists) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b7e1a00810f27..8c212864b90bb 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -541,6 +541,17 @@ struct bio_list {
struct bio *tail;
};
+/* for generic_make_request() */
+struct recursion_to_iteration_bio_lists {
+ /* For stacking drivers submitting to their respective backend.
+ * Added to tail, processed in FIFO order. */
+ struct bio_list recursion;
+ /* For pushing back the "remainder" part resulting from calling
+ * blk_queue_split(). Added to head, processed in LIFO order,
+ * once the "recursion" list has been drained. */
+ struct bio_list remainder;
+};
+
static inline int bio_list_empty(const struct bio_list *bl)
{
return bl->head == NULL;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345d..146eedc524167 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -128,7 +128,7 @@ struct sched_attr {
struct futex_pi_state;
struct robust_list_head;
-struct bio_list;
+struct recursion_to_iteration_bio_lists;
struct fs_struct;
struct perf_event_context;
struct blk_plug;
@@ -1727,7 +1727,7 @@ struct task_struct {
void *journal_info;
/* stacked block device info */
- struct bio_list *bio_list;
+ struct recursion_to_iteration_bio_lists *bio_lists;
#ifdef CONFIG_BLOCK
/* stack plugging */