aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig5
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c121
-rw-r--r--fs/btrfs/async-thread.h37
-rw-r--r--fs/btrfs/backref.c51
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/block-group.c3189
-rw-r--r--fs/btrfs/block-group.h251
-rw-r--r--fs/btrfs/block-rsv.c437
-rw-r--r--fs/btrfs/block-rsv.h101
-rw-r--r--fs/btrfs/btrfs_inode.h33
-rw-r--r--fs/btrfs/check-integrity.c18
-rw-r--r--fs/btrfs/compression.c356
-rw-r--r--fs/btrfs/compression.h60
-rw-r--r--fs/btrfs/ctree.c953
-rw-r--r--fs/btrfs/ctree.h772
-rw-r--r--fs/btrfs/dedupe.h12
-rw-r--r--fs/btrfs/delalloc-space.c490
-rw-r--r--fs/btrfs/delalloc-space.h23
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-ref.c227
-rw-r--r--fs/btrfs/delayed-ref.h132
-rw-r--r--fs/btrfs/dev-replace.c82
-rw-r--r--fs/btrfs/dev-replace.h5
-rw-r--r--fs/btrfs/dir-item.c5
-rw-r--r--fs/btrfs/disk-io.c793
-rw-r--r--fs/btrfs/disk-io.h15
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-io-tree.h248
-rw-r--r--fs/btrfs/extent-tree.c6413
-rw-r--r--fs/btrfs/extent_io.c675
-rw-r--r--fs/btrfs/extent_io.h236
-rw-r--r--fs/btrfs/extent_map.c46
-rw-r--r--fs/btrfs/extent_map.h11
-rw-r--r--fs/btrfs/file-item.c83
-rw-r--r--fs/btrfs/file.c632
-rw-r--r--fs/btrfs/free-space-cache.c209
-rw-r--r--fs/btrfs/free-space-cache.h71
-rw-r--r--fs/btrfs/free-space-tree.c158
-rw-r--r--fs/btrfs/free-space-tree.h21
-rw-r--r--fs/btrfs/inode-item.c70
-rw-r--r--fs/btrfs/inode-map.c37
-rw-r--r--fs/btrfs/inode.c1168
-rw-r--r--fs/btrfs/ioctl.c711
-rw-r--r--fs/btrfs/locking.c470
-rw-r--r--fs/btrfs/locking.h15
-rw-r--r--fs/btrfs/lzo.c61
-rw-r--r--fs/btrfs/math.h28
-rw-r--r--fs/btrfs/misc.h61
-rw-r--r--fs/btrfs/ordered-data.c79
-rw-r--r--fs/btrfs/ordered-data.h13
-rw-r--r--fs/btrfs/print-tree.c14
-rw-r--r--fs/btrfs/props.c268
-rw-r--r--fs/btrfs/props.h7
-rw-r--r--fs/btrfs/qgroup.c148
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/raid56.c120
-rw-r--r--fs/btrfs/raid56.h4
-rw-r--r--fs/btrfs/reada.c54
-rw-r--r--fs/btrfs/ref-verify.c71
-rw-r--r--fs/btrfs/ref-verify.h10
-rw-r--r--fs/btrfs/relocation.c259
-rw-r--r--fs/btrfs/root-tree.c83
-rw-r--r--fs/btrfs/scrub.c186
-rw-r--r--fs/btrfs/send.c683
-rw-r--r--fs/btrfs/space-info.c1115
-rw-r--r--fs/btrfs/space-info.h142
-rw-r--r--fs/btrfs/struct-funcs.c73
-rw-r--r--fs/btrfs/super.c91
-rw-r--r--fs/btrfs/sysfs.c337
-rw-r--r--fs/btrfs/sysfs.h82
-rw-r--r--fs/btrfs/tests/btrfs-tests.c52
-rw-r--r--fs/btrfs/tests/btrfs-tests.h21
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c8
-rw-r--r--fs/btrfs/tests/extent-io-tests.c167
-rw-r--r--fs/btrfs/tests/extent-map-tests.c235
-rw-r--r--fs/btrfs/tests/free-space-tests.c27
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c124
-rw-r--r--fs/btrfs/tests/inode-tests.c58
-rw-r--r--fs/btrfs/tests/qgroup-tests.c24
-rw-r--r--fs/btrfs/transaction.c218
-rw-r--r--fs/btrfs/transaction.h13
-rw-r--r--fs/btrfs/tree-checker.c1196
-rw-r--r--fs/btrfs/tree-checker.h11
-rw-r--r--fs/btrfs/tree-log.c627
-rw-r--r--fs/btrfs/tree-log.h10
-rw-r--r--fs/btrfs/uuid-tree.c8
-rw-r--r--fs/btrfs/volumes.c1453
-rw-r--r--fs/btrfs/volumes.h119
-rw-r--r--fs/btrfs/xattr.c67
-rw-r--r--fs/btrfs/xattr.h7
-rw-r--r--fs/btrfs/zlib.c63
-rw-r--r--fs/btrfs/zstd.c84
94 files changed, 15347 insertions, 12735 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 23537bc8c827..575636f6491e 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,7 +2,12 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
+ select CRYPTO
+ select CRYPTO_CRC32C
select LIBCRC32C
+ select CRYPTO_XXHASH
+ select CRYPTO_SHA256
+ select CRYPTO_BLAKE2B
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..82200dbca5ac 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o free-space-tree.o tree-checker.o
+ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
+ block-rsv.o delalloc-space.o block-group.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 5810463dc6d2..a0af1b952c4d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -93,7 +93,11 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_setxattr(trans, inode, name, value, size, 0);
+ if (trans)
+ ret = btrfs_setxattr(trans, inode, name, value, size, 0);
+ else
+ ret = btrfs_setxattr_trans(inode, name, value, size, 0);
+
out:
kfree(value);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 122cb97c7909..1d32a07bb2d1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -12,9 +12,11 @@
#include "async-thread.h"
#include "ctree.h"
-#define WORK_DONE_BIT 0
-#define WORK_ORDER_DONE_BIT 1
-#define WORK_HIGH_PRIO_BIT 2
+enum {
+ WORK_DONE_BIT,
+ WORK_ORDER_DONE_BIT,
+ WORK_HIGH_PRIO_BIT,
+};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
@@ -51,24 +53,12 @@ struct btrfs_workqueue {
struct __btrfs_workqueue *high;
};
-static void normal_work_helper(struct btrfs_work *work);
-
-#define BTRFS_WORK_HELPER(name) \
-noinline_for_stack void btrfs_##name(struct work_struct *arg) \
-{ \
- struct btrfs_work *work = container_of(arg, struct btrfs_work, \
- normal_work); \
- normal_work_helper(work); \
-}
-
-struct btrfs_fs_info *
-btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
{
return wq->fs_info;
}
-struct btrfs_fs_info *
-btrfs_work_owner(const struct btrfs_work *work)
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
{
return work->wq->fs_info;
}
@@ -87,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
}
-BTRFS_WORK_HELPER(worker_helper);
-BTRFS_WORK_HELPER(delalloc_helper);
-BTRFS_WORK_HELPER(flush_delalloc_helper);
-BTRFS_WORK_HELPER(cache_helper);
-BTRFS_WORK_HELPER(submit_helper);
-BTRFS_WORK_HELPER(fixup_helper);
-BTRFS_WORK_HELPER(endio_helper);
-BTRFS_WORK_HELPER(endio_meta_helper);
-BTRFS_WORK_HELPER(endio_meta_write_helper);
-BTRFS_WORK_HELPER(endio_raid56_helper);
-BTRFS_WORK_HELPER(endio_repair_helper);
-BTRFS_WORK_HELPER(rmw_helper);
-BTRFS_WORK_HELPER(endio_write_helper);
-BTRFS_WORK_HELPER(freespace_write_helper);
-BTRFS_WORK_HELPER(delayed_meta_helper);
-BTRFS_WORK_HELPER(readahead_helper);
-BTRFS_WORK_HELPER(qgroup_rescan_helper);
-BTRFS_WORK_HELPER(extent_refs_helper);
-BTRFS_WORK_HELPER(scrub_helper);
-BTRFS_WORK_HELPER(scrubwrc_helper);
-BTRFS_WORK_HELPER(scrubnc_helper);
-BTRFS_WORK_HELPER(scrubparity_helper);
-
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags, int limit_active, int thresh)
@@ -250,16 +217,16 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
+ bool free_self = false;
while (1) {
- void *wtag;
-
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
@@ -285,22 +252,53 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
list_del(&work->ordered_list);
spin_unlock_irqrestore(lock, flags);
- /*
- * We don't want to call the ordered free functions with the
- * lock held though. Save the work as tag for the trace event,
- * because the callback could free the structure.
- */
- wtag = work;
- work->ordered_free(work);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ if (work == self) {
+ /*
+ * This is the work item that the worker is currently
+ * executing.
+ *
+ * The kernel workqueue code guarantees non-reentrancy
+ * of work items. I.e., if a work item with the same
+ * address and work function is queued twice, the second
+ * execution is blocked until the first one finishes. A
+ * work item may be freed and recycled with the same
+ * work function; the workqueue code assumes that the
+ * original work item cannot depend on the recycled work
+ * item in that case (see find_worker_executing_work()).
+ *
+ * Note that different types of Btrfs work can depend on
+ * each other, and one type of work on one Btrfs
+ * filesystem may even depend on the same type of work
+ * on another Btrfs filesystem via, e.g., a loop device.
+ * Therefore, we must not allow the current work item to
+ * be recycled until we are really done, otherwise we
+ * break the above assumption and can deadlock.
+ */
+ free_self = true;
+ } else {
+ /*
+ * We don't want to call the ordered free functions with
+ * the lock held.
+ */
+ work->ordered_free(work);
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
+ }
}
spin_unlock_irqrestore(lock, flags);
+
+ if (free_self) {
+ self->ordered_free(self);
+ /* NB: self must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, self);
+ }
}
-static void normal_work_helper(struct btrfs_work *work)
+static void btrfs_work_helper(struct work_struct *normal_work)
{
+ struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
+ normal_work);
struct __btrfs_workqueue *wq;
- void *wtag;
int need_order = 0;
/*
@@ -314,29 +312,26 @@ static void normal_work_helper(struct btrfs_work *work)
if (work->ordered_func)
need_order = 1;
wq = work->wq;
- /* Safe for tracepoints in case work gets freed by the callback */
- wtag = work;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
set_bit(WORK_DONE_BIT, &work->flags);
- run_ordered_work(wq);
+ run_ordered_work(wq, work);
+ } else {
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
}
- if (!need_order)
- trace_btrfs_all_work_done(wq->fs_info, wtag);
}
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free)
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free)
{
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
- INIT_WORK(&work->normal_work, uniq_func);
+ INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 7861c9feba5f..a4434301d84d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -29,49 +29,20 @@ struct btrfs_work {
unsigned long flags;
};
-#define BTRFS_WORK_HELPER_PROTO(name) \
-void btrfs_##name(struct work_struct *arg)
-
-BTRFS_WORK_HELPER_PROTO(worker_helper);
-BTRFS_WORK_HELPER_PROTO(delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(cache_helper);
-BTRFS_WORK_HELPER_PROTO(submit_helper);
-BTRFS_WORK_HELPER_PROTO(fixup_helper);
-BTRFS_WORK_HELPER_PROTO(endio_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
-BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
-BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
-BTRFS_WORK_HELPER_PROTO(rmw_helper);
-BTRFS_WORK_HELPER_PROTO(endio_write_helper);
-BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
-BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
-BTRFS_WORK_HELPER_PROTO(readahead_helper);
-BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
-BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
-BTRFS_WORK_HELPER_PROTO(scrub_helper);
-BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-
-
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name,
unsigned int flags,
int limit_active,
int thresh);
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free);
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 11459fe84a29..e5d85311d5d5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -791,7 +791,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
count = node->ref_mod * -1;
break;
default:
- BUG_ON(1);
+ BUG();
}
*total_refs += count;
switch (node->type) {
@@ -1460,17 +1460,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
* callers (such as fiemap) which want to know whether the extent is
* shared but do not need a ref count.
*
- * This attempts to allocate a transaction in order to account for
- * delayed refs, but continues on even when the alloc fails.
+ * This attempts to attach to the running transaction in order to account for
+ * delayed refs, but continues on even when no running transaction exists.
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ struct ulist *roots, struct ulist *tmp)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- struct ulist *tmp = NULL;
- struct ulist *roots = NULL;
struct ulist_iterator uiter;
struct ulist_node *node;
struct seq_list elem = SEQ_LIST_INIT(elem);
@@ -1481,16 +1480,15 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
.share_count = 0,
};
- tmp = ulist_alloc(GFP_NOFS);
- roots = ulist_alloc(GFP_NOFS);
- if (!tmp || !roots) {
- ulist_free(tmp);
- ulist_free(roots);
- return -ENOMEM;
- }
+ ulist_init(roots);
+ ulist_init(tmp);
- trans = btrfs_join_transaction(root);
+ trans = btrfs_join_transaction_nostart(root);
if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
trans = NULL;
down_read(&fs_info->commit_root_sem);
} else {
@@ -1523,8 +1521,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
} else {
up_read(&fs_info->commit_root_sem);
}
- ulist_free(tmp);
- ulist_free(roots);
+out:
+ ulist_release(roots);
+ ulist_release(tmp);
return ret;
}
@@ -1747,7 +1746,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
else if (flags & BTRFS_EXTENT_FLAG_DATA)
*flags_ret = BTRFS_EXTENT_FLAG_DATA;
else
- BUG_ON(1);
+ BUG();
return 0;
}
@@ -1912,13 +1911,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ trans = btrfs_attach_transaction(fs_info->extent_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT &&
+ PTR_ERR(trans) != -EROFS)
+ return PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+
+ if (trans)
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- } else {
+ else
down_read(&fs_info->commit_root_sem);
- }
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
tree_mod_seq_elem.seq, &refs,
@@ -1951,7 +1956,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
- if (!search_commit_root) {
+ if (trans) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans);
} else {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 54d58988483a..777f61dc081e 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ struct ulist *roots, struct ulist *tmp_ulist);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
new file mode 100644
index 000000000000..6934a5b8708f
--- /dev/null
+++ b/fs/btrfs/block-group.c
@@ -0,0 +1,3189 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "disk-io.h"
+#include "free-space-cache.h"
+#include "free-space-tree.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "transaction.h"
+#include "ref-verify.h"
+#include "sysfs.h"
+#include "tree-log.h"
+#include "delalloc-space.h"
+
+/*
+ * Return target flags in extended format or 0 if restripe for this chunk_type
+ * is not in progress
+ *
+ * Should be called with balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ return target;
+}
+
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Return reduced profile in chunk format. If profile changing is in progress
+ * (either running or paused) picks the target profile (if it's already
+ * available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 num_devices = fs_info->fs_devices->rw_devices;
+ u64 target;
+ u64 raid_type;
+ u64 allowed = 0;
+
+ /*
+ * See if restripe for this chunk_type is in progress, if so try to
+ * reduce to the target profile
+ */
+ spin_lock(&fs_info->balance_lock);
+ target = get_restripe_target(fs_info, flags);
+ if (target) {
+ /* Pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ /* First, mask out the RAID levels which aren't possible */
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_array[raid_type].bg_flag;
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
+}
+
+static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+ unsigned seq;
+ u64 flags;
+
+ do {
+ flags = orig_flags;
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ return btrfs_reduce_alloc_profile(fs_info, flags);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+ return get_alloc_profile(fs_info, orig_flags);
+}
+
+void btrfs_get_block_group(struct btrfs_block_group *cache)
+{
+ atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group *cache)
+{
+ if (atomic_dec_and_test(&cache->count)) {
+ WARN_ON(cache->pinned > 0);
+ WARN_ON(cache->reserved > 0);
+
+ /*
+ * If not empty, someone is still holding mutex of
+ * full_stripe_lock, which can only be released by caller.
+ * And it will definitely cause use-after-free when caller
+ * tries to release full stripe lock.
+ *
+ * No better way to resolve, but only to warn.
+ */
+ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ }
+}
+
+/*
+ * This adds the block group to the fs_info rb tree for the block group cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_block_group *cache;
+
+ spin_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_node;
+
+ while (*p) {
+ parent = *p;
+ cache = rb_entry(parent, struct btrfs_block_group, cache_node);
+ if (block_group->start < cache->start) {
+ p = &(*p)->rb_left;
+ } else if (block_group->start > cache->start) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&info->block_group_cache_lock);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&block_group->cache_node, parent, p);
+ rb_insert_color(&block_group->cache_node,
+ &info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->start)
+ info->first_logical_byte = block_group->start;
+
+ spin_unlock(&info->block_group_cache_lock);
+
+ return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group *block_group_cache_tree_search(
+ struct btrfs_fs_info *info, u64 bytenr, int contains)
+{
+ struct btrfs_block_group *cache, *ret = NULL;
+ struct rb_node *n;
+ u64 end, start;
+
+ spin_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_node;
+
+ while (n) {
+ cache = rb_entry(n, struct btrfs_block_group, cache_node);
+ end = cache->start + cache->length - 1;
+ start = cache->start;
+
+ if (bytenr < start) {
+ if (!contains && (!ret || start < ret->start))
+ ret = cache;
+ n = n->rb_left;
+ } else if (bytenr > start) {
+ if (contains && bytenr <= end) {
+ ret = cache;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ ret = cache;
+ break;
+ }
+ }
+ if (ret) {
+ btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->start)
+ info->first_logical_byte = ret->start;
+ }
+ spin_unlock(&info->block_group_cache_lock);
+
+ return ret;
+}
+
+/*
+ * Return the block group that starts at or after bytenr
+ */
+struct btrfs_block_group *btrfs_lookup_first_block_group(
+ struct btrfs_fs_info *info, u64 bytenr)
+{
+ return block_group_cache_tree_search(info, bytenr, 0);
+}
+
+/*
+ * Return the block group that contains the given bytenr
+ */
+struct btrfs_block_group *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info, u64 bytenr)
+{
+ return block_group_cache_tree_search(info, bytenr, 1);
+}
+
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->start + cache->length;
+
+ spin_unlock(&fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ }
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group, cache_node);
+ btrfs_get_block_group(cache);
+ } else
+ cache = NULL;
+ spin_unlock(&fs_info->block_group_cache_lock);
+ return cache;
+}
+
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *bg;
+ bool ret = true;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg)
+ return false;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ atomic_inc(&bg->nocow_writers);
+ spin_unlock(&bg->lock);
+
+ /* No put on block group, done by btrfs_dec_nocow_writers */
+ if (!ret)
+ btrfs_put_block_group(bg);
+
+ return ret;
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->nocow_writers))
+ wake_up_var(&bg->nocow_writers);
+ /*
+ * Once for our lookup and once for the lookup done by a previous call
+ * to btrfs_inc_nocow_writers()
+ */
+ btrfs_put_block_group(bg);
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
+{
+ wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start)
+{
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->reservations))
+ wake_up_var(&bg->reservations);
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
+{
+ struct btrfs_space_info *space_info = bg->space_info;
+
+ ASSERT(bg->ro);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+ return;
+
+ /*
+ * Our block group is read only but before we set it to read only,
+ * some task might have had allocated an extent from it already, but it
+ * has not yet created a respective ordered extent (and added it to a
+ * root's list of ordered extents).
+ * Therefore wait for any task currently allocating extents, since the
+ * block group's reservations counter is incremented while a read lock
+ * on the groups' semaphore is held and decremented after releasing
+ * the read access on that semaphore and creating the ordered extent.
+ */
+ down_write(&space_info->groups_sem);
+ up_write(&space_info->groups_sem);
+
+ wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
+}
+
+struct btrfs_caching_control *btrfs_get_caching_control(
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (!cache->caching_ctl) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ refcount_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (refcount_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
+/*
+ * When we wait for progress in the block group caching, its because our
+ * allocation attempt failed at least once. So, we must sleep and let some
+ * progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to show
+ * up, and then it will check the block group free space numbers for our min
+ * num_bytes. Another option is to have it go ahead and look in the rbtree for
+ * a free extent of a given size, but this is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
+ u64 num_bytes)
+{
+ struct btrfs_caching_control *caching_ctl;
+
+ caching_ctl = btrfs_get_caching_control(cache);
+ if (!caching_ctl)
+ return;
+
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
+ (cache->free_space_ctl->free_space >= num_bytes));
+
+ btrfs_put_caching_control(caching_ctl);
+}
+
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ int ret = 0;
+
+ caching_ctl = btrfs_get_caching_control(cache);
+ if (!caching_ctl)
+ return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ if (cache->cached == BTRFS_CACHE_ERROR)
+ ret = -EIO;
+ btrfs_put_caching_control(caching_ctl);
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ u64 start = block_group->start;
+ u64 len = block_group->length;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ fs_info->nodesize : fs_info->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
+/*
+ * This is only called by btrfs_cache_block_group, since we could have freed
+ * extents we need to check the pinned_extents for any extents that can't be
+ * used yet since their free space will be released as soon as the transaction
+ * commits.
+ */
+u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
+{
+ struct btrfs_fs_info *info = block_group->fs_info;
+ u64 extent_start, extent_end, size, total_added = 0;
+ int ret;
+
+ while (start < end) {
+ ret = find_first_extent_bit(info->pinned_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
+ if (ret)
+ break;
+
+ if (extent_start <= start) {
+ start = extent_end + 1;
+ } else if (extent_start > start && extent_start < end) {
+ size = extent_start - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start,
+ size);
+ BUG_ON(ret); /* -ENOMEM or logic error */
+ start = extent_end + 1;
+ } else {
+ break;
+ }
+ }
+
+ if (start < end) {
+ size = end - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start, size);
+ BUG_ON(ret); /* -ENOMEM or logic error */
+ }
+
+ return total_added;
+}
+
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 total_found = 0;
+ u64 last = 0;
+ u32 nritems;
+ int ret;
+ bool wakeup = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(block_group))
+ wakeup = false;
+#endif
+ /*
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = READA_FORWARD;
+
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+next:
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info) > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
+ if (ret)
+ break;
+
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ if (wakeup)
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ cond_resched();
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
+ }
+
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ if (key.objectid < last) {
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ if (wakeup)
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ goto next;
+ }
+
+ if (key.objectid < block_group->start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ if (key.objectid >= block_group->start + block_group->length)
+ break;
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ total_found += add_new_free_space(block_group, last,
+ key.objectid);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ last = key.objectid +
+ fs_info->nodesize;
+ else
+ last = key.objectid + key.offset;
+
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
+ }
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+
+ total_found += add_new_free_space(block_group, last,
+ block_group->start + block_group->length);
+ caching_ctl->progress = (u64)-1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group)) {
+ u64 bytes_used;
+
+ spin_lock(&block_group->space_info->lock);
+ spin_lock(&block_group->lock);
+ bytes_used = block_group->length - block_group->used;
+ block_group->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(block_group);
+ }
+#endif
+
+ caching_ctl->progress = (u64)-1;
+
+ up_read(&fs_info->commit_root_sem);
+ btrfs_free_excluded_extents(block_group);
+ mutex_unlock(&caching_ctl->mutex);
+
+ wake_up(&caching_ctl->wait);
+
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_block_group(block_group);
+}
+
+int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+{
+ DEFINE_WAIT(wait);
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ int ret = 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ if (!caching_ctl)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->start;
+ refcount_set(&caching_ctl->count, 1);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ refcount_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ btrfs_put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
+ return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
+
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ mutex_lock(&caching_ctl->mutex);
+ ret = load_free_space_cache(cache);
+
+ spin_lock(&cache->lock);
+ if (ret == 1) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->last_byte_to_unpin = (u64)-1;
+ caching_ctl->progress = (u64)-1;
+ } else {
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
+ }
+ }
+ spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->length - cache->used;
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(cache);
+ }
+#endif
+ mutex_unlock(&caching_ctl->mutex);
+
+ wake_up(&caching_ctl->wait);
+ if (ret == 1) {
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_free_excluded_extents(cache);
+ return 0;
+ }
+ } else {
+ /*
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ if (load_cache_only) {
+ btrfs_put_caching_control(caching_ctl);
+ return 0;
+ }
+
+ down_write(&fs_info->commit_root_sem);
+ refcount_inc(&caching_ctl->count);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->commit_root_sem);
+
+ btrfs_get_block_group(cache);
+
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+
+ return ret;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * Clear incompat bits for the following feature(s):
+ *
+ * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
+ * in the whole filesystem
+ *
+ * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
+ */
+static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ bool found_raid56 = false;
+ bool found_raid1c34 = false;
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
+ struct list_head *head = &fs_info->space_info;
+ struct btrfs_space_info *sinfo;
+
+ list_for_each_entry_rcu(sinfo, head, list) {
+ down_read(&sinfo->groups_sem);
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
+ found_raid1c34 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
+ found_raid1c34 = true;
+ up_read(&sinfo->groups_sem);
+ }
+ if (found_raid56)
+ btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (found_raid1c34)
+ btrfs_clear_fs_incompat(fs_info, RAID1C34);
+ }
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_path *path;
+ struct btrfs_block_group *block_group;
+ struct btrfs_free_cluster *cluster;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct kobject *kobj = NULL;
+ int ret;
+ int index;
+ int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
+ bool remove_rsv = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ trace_btrfs_remove_block_group(block_group);
+ /*
+ * Free the reserved super bytes from this block group before
+ * remove it.
+ */
+ btrfs_free_excluded_extents(block_group);
+ btrfs_free_ref_tree_range(fs_info, block_group->start,
+ block_group->length);
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+
+ /* make sure this block group isn't part of an allocation cluster */
+ cluster = &fs_info->data_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ /*
+ * make sure this block group isn't part of a metadata
+ * allocation cluster
+ */
+ cluster = &fs_info->meta_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
+ inode = lookup_free_space_inode(block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * Make sure our free space cache IO is done before removing the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(trans, block_group, path);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ remove_rsv = true;
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ if (!IS_ERR(inode)) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for our lookup ref */
+ btrfs_add_delayed_iput(inode);
+ }
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.type = 0;
+ key.offset = block_group->start;
+
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ btrfs_release_path(path);
+ if (ret == 0) {
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ spin_lock(&fs_info->block_group_cache_lock);
+ rb_erase(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+
+ if (fs_info->first_logical_byte == block_group->start)
+ fs_info->first_logical_byte = (u64)-1;
+ spin_unlock(&fs_info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ /*
+ * we must use list_del_init so people can check to see if they
+ * are still on the list after taking the semaphore
+ */
+ list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
+ clear_avail_alloc_bits(fs_info, block_group->flags);
+ }
+ up_write(&block_group->space_info->groups_sem);
+ clear_incompat_bg_bits(fs_info, block_group->flags);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+
+ if (block_group->has_caching_ctl)
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ btrfs_wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ btrfs_remove_free_space_cache(block_group);
+
+ spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->length);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->length);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->length * factor);
+ }
+ block_group->space_info->total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->disk_total -= block_group->length * factor;
+
+ spin_unlock(&block_group->space_info->lock);
+
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+
+ mutex_lock(&fs_info->chunk_mutex);
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ spin_unlock(&block_group->lock);
+
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -EIO;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+out:
+ if (remove_rsv)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info, const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = em->map_lookup;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
+/*
+ * Mark block group @cache read-only, so later write won't happen to block
+ * group @cache.
+ *
+ * If @force is not set, this function will only mark the block group readonly
+ * if we have enough free space (1M) in other metadata/system block groups.
+ * If @force is not set, this function will mark the block group readonly
+ * without checking free space.
+ *
+ * NOTE: This function doesn't care if other block groups can contain all the
+ * data in this block group. That check should be done by relocation routine,
+ * not this function.
+ */
+static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ u64 sinfo_used;
+ u64 min_allocable_bytes;
+ int ret = -ENOSPC;
+
+ /*
+ * We need some metadata space and system metadata space for
+ * allocating chunks in some corner cases until we force to set
+ * it to be readonly.
+ */
+ if ((sinfo->flags &
+ (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+ !force)
+ min_allocable_bytes = SZ_1M;
+ else
+ min_allocable_bytes = 0;
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+
+ if (cache->ro) {
+ cache->ro++;
+ ret = 0;
+ goto out;
+ }
+
+ num_bytes = cache->length - cache->reserved - cache->pinned -
+ cache->bytes_super - cache->used;
+ sinfo_used = btrfs_space_info_used(sinfo, true);
+
+ /*
+ * sinfo_used + num_bytes should always <= sinfo->total_bytes.
+ *
+ * Here we make sure if we mark this bg RO, we still have enough
+ * free space as buffer (if min_allocable_bytes is not 0).
+ */
+ if (sinfo_used + num_bytes + min_allocable_bytes <=
+ sinfo->total_bytes) {
+ sinfo->bytes_readonly += num_bytes;
+ cache->ro++;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+ ret = 0;
+ }
+out:
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro", cache->start);
+ btrfs_info(cache->fs_info,
+ "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+ sinfo_used, num_bytes, min_allocable_bytes);
+ btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
+ return ret;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+ int trimming;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved || block_group->pinned ||
+ block_group->used || block_group->ro ||
+ list_is_singular(&block_group->list)) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = inc_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->start);
+ if (IS_ERR(trans)) {
+ btrfs_dec_block_group_ro(block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->start;
+ end = start + block_group->length - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info,
+ -block_group->pinned);
+ space_info->bytes_readonly += block_group->pinned;
+ percpu_counter_add_batch(&space_info->total_bytes_pinned,
+ -block_group->pinned,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ block_group->pinned = 0;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(fs_info, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, block_group->start);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
+end_trans:
+ btrfs_end_transaction(trans);
+next:
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_unused_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root = fs_info->extent_root;
+ int ret = 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_block_group_item bg;
+ u64 flags;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid >= key->objectid &&
+ found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &root->fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, found_key.objectid,
+ found_key.offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ found_key.objectid, found_key.offset);
+ ret = -ENOENT;
+ } else if (em->start != found_key.objectid ||
+ em->len != found_key.offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+ ret = -EUCLEAN;
+ } else {
+ read_extent_buffer(leaf, &bg,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ found_key.objectid,
+ found_key.offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK &
+ em->map_lookup->type));
+ ret = -EUCLEAN;
+ } else {
+ ret = 0;
+ }
+ }
+ free_extent_map(em);
+ goto out;
+ }
+ path->slots[0]++;
+ }
+out:
+ return ret;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+static int exclude_super_stripes(struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
+ cache->bytes_super += stripe_len;
+ ret = btrfs_add_excluded_extent(fs_info, cache->start,
+ stripe_len);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(fs_info, cache->start,
+ bytenr, &logical, &nr, &stripe_len);
+ if (ret)
+ return ret;
+
+ while (nr--) {
+ u64 start, len;
+
+ if (logical[nr] > cache->start + cache->length)
+ continue;
+
+ if (logical[nr] + stripe_len <= cache->start)
+ continue;
+
+ start = logical[nr];
+ if (start < cache->start) {
+ start = cache->start;
+ len = (logical[nr] + stripe_len) - start;
+ } else {
+ len = min_t(u64, stripe_len,
+ cache->start + cache->length - start);
+ }
+
+ cache->bytes_super += len;
+ ret = btrfs_add_excluded_extent(fs_info, start, len);
+ if (ret) {
+ kfree(logical);
+ return ret;
+ }
+ }
+
+ kfree(logical);
+ }
+ return 0;
+}
+
+static void link_block_group(struct btrfs_block_group *cache)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int index = btrfs_bg_flags_to_raid_index(cache->flags);
+ bool first = false;
+
+ down_write(&space_info->groups_sem);
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+
+ if (first)
+ btrfs_sysfs_add_block_group_type(cache);
+}
+
+static struct btrfs_block_group *btrfs_create_block_group_cache(
+ struct btrfs_fs_info *fs_info, u64 start, u64 size)
+{
+ struct btrfs_block_group *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->start = start;
+ cache->length = size;
+
+ cache->fs_info = fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
+ set_free_space_tree_thresholds(cache);
+
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
+ INIT_LIST_HEAD(&cache->io_list);
+ btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
+ btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+
+ return cache;
+}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct btrfs_block_group *bg;
+ u64 start = 0;
+ int ret = 0;
+
+ while (1) {
+ read_lock(&map_tree->lock);
+ /*
+ * lookup_extent_mapping will return the first extent map
+ * intersecting the range, so setting @len to 1 is enough to
+ * get the first chunk.
+ */
+ em = lookup_extent_mapping(map_tree, start, 1);
+ read_unlock(&map_tree->lock);
+ if (!em)
+ break;
+
+ bg = btrfs_lookup_block_group(fs_info, em->start);
+ if (!bg) {
+ btrfs_err(fs_info,
+ "chunk start=%llu len=%llu doesn't have corresponding block group",
+ em->start, em->len);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ break;
+ }
+ if (bg->start != em->start || bg->length != em->len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+ em->start, em->len,
+ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ bg->start, bg->length,
+ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ start = em->start + em->len;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ }
+ return ret;
+}
+
+static int read_one_block_group(struct btrfs_fs_info *info,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ int need_clear)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_block_group *cache;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group_item bgi;
+ const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
+ int slot = path->slots[0];
+ int ret;
+
+ ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+ cache = btrfs_create_block_group_cache(info, key->objectid, key->offset);
+ if (!cache)
+ return -ENOMEM;
+
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ cache->used = btrfs_stack_block_group_used(&bgi);
+ cache->flags = btrfs_stack_block_group_flags(&bgi);
+ if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+ cache->start);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /*
+ * We need to exclude the super stripes now so that the space info has
+ * super bytes accounted for, otherwise we'll think we have more space
+ * than we actually do.
+ */
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case. */
+ btrfs_free_excluded_extents(cache);
+ goto error;
+ }
+
+ /*
+ * Check for two cases, either we are full, and therefore don't need
+ * to bother with the caching work since we won't find any space, or we
+ * are empty, and we can just add all the space in and be done with it.
+ * This saves us _a_lot_ of time, particularly in the full case.
+ */
+ if (key->offset == cache->used) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->used == 0) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, key->objectid,
+ key->objectid + key->offset);
+ btrfs_free_excluded_extents(cache);
+ }
+
+ ret = btrfs_add_block_group_cache(info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ goto error;
+ }
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_update_space_info(info, cache->flags, key->offset,
+ cache->used, cache->bytes_super, &space_info);
+
+ cache->space_info = space_info;
+
+ link_block_group(cache);
+
+ set_avail_alloc_bits(info, cache->flags);
+ if (btrfs_chunk_readonly(info, cache->start)) {
+ inc_block_group_ro(cache, 1);
+ } else if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ btrfs_mark_bg_unused(cache);
+ }
+ return 0;
+error:
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+int btrfs_read_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_block_group *cache;
+ struct btrfs_space_info *space_info;
+ struct btrfs_key key;
+ int need_clear = 0;
+ u64 cache_gen;
+
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ cache_gen = btrfs_super_cache_generation(info->super_copy);
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ btrfs_super_generation(info->super_copy) != cache_gen)
+ need_clear = 1;
+ if (btrfs_test_opt(info, CLEAR_CACHE))
+ need_clear = 1;
+
+ while (1) {
+ ret = find_first_block_group(info, path, &key);
+ if (ret > 0)
+ break;
+ if (ret != 0)
+ goto error;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ret = read_one_block_group(info, path, &key, need_clear);
+ if (ret < 0)
+ goto error;
+ key.objectid += key.offset;
+ key.offset = 0;
+ btrfs_release_path(path);
+ }
+
+ list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ if (!(btrfs_get_alloc_profile(info, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * Avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_RAID0],
+ list)
+ inc_block_group_ro(cache, 1);
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_SINGLE],
+ list)
+ inc_block_group_ro(cache, 1);
+ }
+
+ btrfs_init_global_block_rsv(info);
+ ret = check_chunk_block_group_mappings(info);
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ if (!trans->can_flush_pending_bgs)
+ return;
+
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ if (ret)
+ goto next;
+
+ spin_lock(&block_group->lock);
+ btrfs_set_stack_block_group_used(&item, block_group->used);
+ btrfs_set_stack_block_group_chunk_objectid(&item,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&item, block_group->flags);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ add_block_group_free_space(trans, block_group);
+ /* Already aborted the transaction if it failed. */
+next:
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ list_del_init(&block_group->bg_list);
+ }
+ btrfs_trans_release_chunk_metadata(trans);
+}
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+ u64 type, u64 chunk_offset, u64 size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ int ret;
+
+ btrfs_set_log_full_commit(trans);
+
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->used = bytes_used;
+ cache->flags = type;
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case */
+ btrfs_free_excluded_extents(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
+ add_new_free_space(cache, chunk_offset, chunk_offset + size);
+
+ btrfs_free_excluded_extents(cache);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
+ /*
+ * Ensure the corresponding space_info object is created and
+ * assigned to our block group. We want our bg to be added to the rbtree
+ * with its ->space_info set.
+ */
+ cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ ASSERT(cache->space_info);
+
+ ret = btrfs_add_block_group_cache(fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
+ trace_btrfs_add_block_group(fs_info, cache, 1);
+ btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
+ cache->bytes_super, &cache->space_info);
+ btrfs_update_global_block_rsv(fs_info);
+
+ link_block_group(cache);
+
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_update_delayed_refs_rsv(trans);
+
+ set_avail_alloc_bits(fs_info, type);
+ return 0;
+}
+
+static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 num_devices;
+ u64 stripped;
+
+ /*
+ * if restripe for this chunk_type is on pick target profile and
+ * return, otherwise do the usual balance
+ */
+ stripped = get_restripe_target(fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
+
+ num_devices = fs_info->fs_devices->rw_devices;
+
+ stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
+ BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
+
+ if (num_devices == 1) {
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* turn raid0 into single device chunks */
+ if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return stripped;
+
+ /* turn mirroring into duplication */
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return stripped | BTRFS_BLOCK_GROUP_DUP;
+ } else {
+ /* they already had raid on here, just return */
+ if (flags & stripped)
+ return flags;
+
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* switch duplicated blocks with raid1 */
+ if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+ /* this is drive concat, leave it alone */
+ }
+
+ return flags;
+}
+
+/*
+ * Mark one block group RO, can be called several times for the same block
+ * group.
+ *
+ * @cache: the destination block group
+ * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
+ * ensure we still have some free space after marking this
+ * block group RO.
+ */
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+ u64 alloc_flags;
+ int ret;
+
+again:
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * we're not allowed to set block groups readonly after the dirty
+ * block groups cache has started writing. If it already started,
+ * back off and let this transaction commit
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ if (do_chunk_alloc) {
+ /*
+ * If we are changing raid levels, try to allocate a
+ * corresponding block group with the new raid level.
+ */
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = btrfs_chunk_alloc(trans, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ ret = inc_block_group_ro(cache, !do_chunk_alloc);
+ if (!do_chunk_alloc)
+ goto unlock_out;
+ if (!ret)
+ goto out;
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
+ ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ ret = inc_block_group_ro(cache, 0);
+out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ mutex_lock(&fs_info->chunk_mutex);
+ check_system_chunk(trans, alloc_flags);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+unlock_out:
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ if (!--cache->ro) {
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super - cache->used;
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ unsigned long bi;
+ struct extent_buffer *leaf;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_key key;
+
+ key.objectid = cache->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = cache->length;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&bgi, cache->flags);
+ write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
+ btrfs_mark_buffer_dirty(leaf);
+fail:
+ btrfs_release_path(path);
+ return ret;
+
+}
+
+static int cache_save_setup(struct btrfs_block_group *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct inode *inode = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ u64 alloc_hint = 0;
+ int dcs = BTRFS_DC_ERROR;
+ u64 num_pages = 0;
+ int retries = 0;
+ int ret = 0;
+
+ /*
+ * If this block group is smaller than 100 megs don't bother caching the
+ * block group.
+ */
+ if (block_group->length < (100 * SZ_1M)) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ if (trans->aborted)
+ return 0;
+again:
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ ret = PTR_ERR(inode);
+ btrfs_release_path(path);
+ goto out;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retries);
+ retries++;
+
+ if (block_group->ro)
+ goto out_free;
+
+ ret = create_free_space_inode(trans, block_group, path);
+ if (ret)
+ goto out_free;
+ goto again;
+ }
+
+ /*
+ * We want to set the generation to 0, that way if anything goes wrong
+ * from here on out we know not to trust this cache when we load up next
+ * time.
+ */
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ /*
+ * So theoretically we could recover from this, simply set the
+ * super cache generation to 0 so we know to invalidate the
+ * cache, but then we'd have to keep track of the block groups
+ * that fail this way so we know we _have_ to reset this cache
+ * before the next commit or risk reading stale cache. So to
+ * limit our exposure to horrible edge cases lets just abort the
+ * transaction, this only happens in really bad situations
+ * anyway.
+ */
+ btrfs_abort_transaction(trans, ret);
+ goto out_put;
+ }
+ WARN_ON(ret);
+
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_check_trunc_cache_free_space(fs_info,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
+ if (ret)
+ goto out_put;
+ }
+
+ spin_lock(&block_group->lock);
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option,
+ * c) we're with v2 space_cache (FREE_SPACE_TREE).
+ */
+ dcs = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ goto out_put;
+ }
+ spin_unlock(&block_group->lock);
+
+ /*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ num_pages = div_u64(block_group->length, SZ_256M);
+ if (!num_pages)
+ num_pages = 1;
+
+ num_pages *= 16;
+ num_pages *= PAGE_SIZE;
+
+ ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+ num_pages, num_pages,
+ &alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
+ if (!ret)
+ dcs = BTRFS_DC_SETUP;
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+
+out_put:
+ iput(inode);
+out_free:
+ btrfs_release_path(path);
+out:
+ spin_lock(&block_group->lock);
+ if (!ret && dcs == BTRFS_DC_SETUP)
+ block_group->cache_generation = trans->transid;
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
+
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache, *tmp;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_path *path;
+
+ if (list_empty(&cur_trans->dirty_bgs) ||
+ !btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Could add new block groups, use _safe just in case */
+ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+ dirty_list) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * Transaction commit does final block group cache writeback during a critical
+ * section where nothing is allowed to change the FS. This is required in
+ * order for the cache to actually match the block group, but can introduce a
+ * lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
+ * There's a chance we'll have to redo some of it if the block group changes
+ * again during the commit, but it greatly reduces the commit latency by
+ * getting rid of the easy block groups while we're still allowing others to
+ * join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path = NULL;
+ LIST_HEAD(dirty);
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+ int loops = 0;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cur_trans->dirty_bgs)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ return 0;
+ }
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+ /* Make sure all the block groups on our dirty list actually exist */
+ btrfs_create_pending_block_groups(trans);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ /*
+ * cache_write_mutex is here only to save us from balance or automatic
+ * removal of empty block groups deleting this block group while we are
+ * writing out the cache
+ */
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ while (!list_empty(&dirty)) {
+ bool drop_reserve = true;
+
+ cache = list_first_entry(&dirty, struct btrfs_block_group,
+ dirty_list);
+ /*
+ * This can happen if something re-dirties a block group that
+ * is already under IO. Just wait for it to finish and then do
+ * it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ }
+
+
+ /*
+ * btrfs_wait_cache_io uses the cache->dirty_list to decide if
+ * it should update the cache_state. Don't delete until after
+ * we wait.
+ *
+ * Since we're not running in the commit critical section
+ * we need the dirty_bgs_lock to protect from update_block_group
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+
+ /*
+ * The cache_write_mutex is protecting the
+ * io_list, also refer to the definition of
+ * btrfs_transaction::io_bgs for more details
+ */
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * If we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, path, cache);
+ /*
+ * Our block group might still be attached to the list
+ * of new block groups in the transaction handle of some
+ * other task (struct btrfs_trans_handle->new_bgs). This
+ * means its block group item isn't yet in the extent
+ * tree. If this happens ignore the error, as we will
+ * try again later in the critical section of the
+ * transaction commit.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &cur_trans->dirty_bgs);
+ btrfs_get_block_group(cache);
+ drop_reserve = false;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ }
+ }
+
+ /* If it's not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ if (drop_reserve)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+
+ if (ret)
+ break;
+
+ /*
+ * Avoid blocking other tasks for too long. It might even save
+ * us from writing caches for block groups that are going to be
+ * removed.
+ */
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ }
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ /*
+ * Go through delayed refs for all the stuff we've just kicked off
+ * and then loop back (just once)
+ */
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret && loops == 0) {
+ loops++;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ /*
+ * dirty_bgs_lock protects us from concurrent block group
+ * deletes too (not just cache_write_mutex).
+ */
+ if (!list_empty(&dirty)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ goto again;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret < 0) {
+ btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path;
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group,
+ dirty_list);
+
+ /*
+ * This can happen if cache_save_setup re-dirties a block group
+ * that is already under IO. Just wait for it to finish and
+ * then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+
+ /*
+ * Don't remove from the dirty list until after we've waited on
+ * any pending IO
+ */
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans,
+ (unsigned long) -1);
+
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * If we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * a very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, path, cache);
+ }
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ /* If its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
+ while (!list_empty(io)) {
+ cache = list_first_entry(io, struct btrfs_block_group,
+ io_list);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, int alloc)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_block_group *cache = NULL;
+ u64 total = num_bytes;
+ u64 old_val;
+ u64 byte_in_group;
+ int factor;
+ int ret = 0;
+
+ /* Block accounting for super block */
+ spin_lock(&info->delalloc_root_lock);
+ old_val = btrfs_super_bytes_used(info->super_copy);
+ if (alloc)
+ old_val += num_bytes;
+ else
+ old_val -= num_bytes;
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
+ spin_unlock(&info->delalloc_root_lock);
+
+ while (total) {
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache) {
+ ret = -ENOENT;
+ break;
+ }
+ factor = btrfs_bg_type_to_factor(cache->flags);
+
+ /*
+ * If this block group has free space cache written out, we
+ * need to make sure to load it if we are removing space. This
+ * is because we need the unpinning stage to actually add the
+ * space back to the block group, otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, 1);
+
+ byte_in_group = bytenr - cache->start;
+ WARN_ON(byte_in_group > cache->length);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+
+ old_val = cache->used;
+ num_bytes = min(total, cache->length - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info,
+ cache->space_info, num_bytes);
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ percpu_counter_add_batch(
+ &cache->space_info->total_bytes_pinned,
+ num_bytes,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0)
+ btrfs_mark_bg_unused(cache);
+
+ btrfs_put_block_group(cache);
+ total -= num_bytes;
+ bytenr += num_bytes;
+ }
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ btrfs_update_delayed_refs_rsv(trans);
+ return ret;
+}
+
+/**
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @ram_bytes: The number of bytes of file content, and will be same to
+ * @num_bytes except for the compress path.
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space. If this is a
+ * reservation and the block group has become read only we cannot make the
+ * reservation and return -EAGAIN, otherwise this function always succeeds.
+ */
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info, "space_info",
+ space_info->flags, num_bytes, 1);
+ btrfs_space_info_update_bytes_may_use(cache->fs_info,
+ space_info, -ram_bytes);
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk. For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this with
+ * reserve set to 0 in order to clear the reservation.
+ */
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
+ u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+}
+
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ found->force_alloc = CHUNK_ALLOC_FORCE;
+ }
+ rcu_read_unlock();
+}
+
+static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *sinfo, int force)
+{
+ u64 bytes_used = btrfs_space_info_used(sinfo, false);
+ u64 thresh;
+
+ if (force == CHUNK_ALLOC_FORCE)
+ return 1;
+
+ /*
+ * in limited mode, we want to have some free space up to
+ * about 1% of the FS size.
+ */
+ if (force == CHUNK_ALLOC_LIMITED) {
+ thresh = btrfs_super_total_bytes(fs_info->super_copy);
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+
+ if (sinfo->total_bytes - bytes_used < thresh)
+ return 1;
+ }
+
+ if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+ return 0;
+ return 1;
+}
+
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
+{
+ u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+
+ return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+}
+
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ */
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+ enum btrfs_chunk_alloc_enum force)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *space_info;
+ bool wait_for_alloc = false;
+ bool should_alloc = false;
+ int ret = 0;
+
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
+
+ do {
+ spin_lock(&space_info->lock);
+ if (force < space_info->force_alloc)
+ force = space_info->force_alloc;
+ should_alloc = should_alloc_chunk(fs_info, space_info, force);
+ if (space_info->full) {
+ /* No more free physical space */
+ if (should_alloc)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&space_info->lock);
+ return ret;
+ } else if (!should_alloc) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ } else if (space_info->chunk_alloc) {
+ /*
+ * Someone is already allocating, so we need to block
+ * until this someone is finished and then loop to
+ * recheck if we should continue with our allocation
+ * attempt.
+ */
+ wait_for_alloc = true;
+ spin_unlock(&space_info->lock);
+ mutex_lock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ /* Proceed with allocation */
+ space_info->chunk_alloc = 1;
+ wait_for_alloc = false;
+ spin_unlock(&space_info->lock);
+ }
+
+ cond_resched();
+ } while (wait_for_alloc);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ trans->allocating_chunk = true;
+
+ /*
+ * If we have mixed data/metadata chunks we want to make sure we keep
+ * allocating mixed chunks instead of individual chunks.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+ /*
+ * if we're doing a data chunk, go ahead and make sure that
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+ force_metadata_allocation(fs_info);
+ }
+
+ /*
+ * Check if we have enough space in SYSTEM chunk because we may need
+ * to update devices.
+ */
+ check_system_chunk(trans, flags);
+
+ ret = btrfs_alloc_chunk(trans, flags);
+ trans->allocating_chunk = false;
+
+ spin_lock(&space_info->lock);
+ if (ret < 0) {
+ if (ret == -ENOSPC)
+ space_info->full = 1;
+ else
+ goto out;
+ } else {
+ ret = 1;
+ space_info->max_extent_size = 0;
+ }
+
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ /*
+ * When we allocate a new chunk we reserve space in the chunk block
+ * reserve to make sure we can COW nodes/leafs in the chunk tree or
+ * add new nodes/leafs to it if we end up needing to do it when
+ * inserting the chunk item and updating device items as part of the
+ * second phase of chunk allocation, performed by
+ * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+ * large number of new block groups to create in our transaction
+ * handle's new_bgs list to avoid exhausting the chunk block reserve
+ * in extreme cases - like having a single transaction create many new
+ * block groups when starting to write out the free space caches of all
+ * the block groups that were made dirty during the lifetime of the
+ * transaction.
+ */
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
+ btrfs_create_pending_block_groups(trans);
+
+ return ret;
+}
+
+static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+{
+ u64 num_dev;
+
+ num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
+ if (!num_dev)
+ num_dev = fs_info->fs_devices->rw_devices;
+
+ return num_dev;
+}
+
+/*
+ * Reserve space in the system space for allocating or removing a chunk
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *info;
+ u64 left;
+ u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - btrfs_space_info_used(info, true);
+ spin_unlock(&info->lock);
+
+ num_devs = get_profile_num_devs(fs_info, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, thresh, type);
+ btrfs_dump_space_info(fs_info, info, 0, 0);
+ }
+
+ if (left < thresh) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ &fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
+ }
+}
+
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group *block_group;
+ u64 last = 0;
+
+ while (1) {
+ struct inode *inode;
+
+ block_group = btrfs_lookup_first_block_group(info, last);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (block_group->iref)
+ break;
+ spin_unlock(&block_group->lock);
+ block_group = btrfs_next_block_group(block_group);
+ }
+ if (!block_group) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+
+ inode = block_group->inode;
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ last = block_group->start + block_group->length;
+ btrfs_put_block_group(block_group);
+ }
+}
+
+/*
+ * Must be called only after stopping all workers, since we could have block
+ * group caching kthreads running, and therefore they could race with us if we
+ * freed the block groups before stopping them.
+ */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct rb_node *n;
+
+ down_write(&info->commit_root_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ btrfs_put_caching_control(caching_ctl);
+ }
+ up_write(&info->commit_root_sem);
+
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
+ spin_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group,
+ cache_node);
+ rb_erase(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+ spin_unlock(&info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ /*
+ * We haven't cached this block group, which means we could
+ * possibly have excluded extents on this block group.
+ */
+ if (block_group->cached == BTRFS_CACHE_NO ||
+ block_group->cached == BTRFS_CACHE_ERROR)
+ btrfs_free_excluded_extents(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+ ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
+ ASSERT(list_empty(&block_group->dirty_list));
+ ASSERT(list_empty(&block_group->io_list));
+ ASSERT(list_empty(&block_group->bg_list));
+ ASSERT(atomic_read(&block_group->count) == 1);
+ btrfs_put_block_group(block_group);
+
+ spin_lock(&info->block_group_cache_lock);
+ }
+ spin_unlock(&info->block_group_cache_lock);
+
+ /*
+ * Now that all the block groups are freed, go through and free all the
+ * space_info structs. This is only called during the final stages of
+ * unmount, and so we know nobody is using them. We call
+ * synchronize_rcu() once before we start, just to be on the safe side.
+ */
+ synchronize_rcu();
+
+ btrfs_release_global_block_rsv(info);
+
+ while (!list_empty(&info->space_info)) {
+ space_info = list_entry(info->space_info.next,
+ struct btrfs_space_info,
+ list);
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually
+ * important and indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ list_del(&space_info->list);
+ btrfs_sysfs_remove_space_info(space_info);
+ }
+ return 0;
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
new file mode 100644
index 000000000000..9b409676c4b2
--- /dev/null
+++ b/fs/btrfs/block-group.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_GROUP_H
+#define BTRFS_BLOCK_GROUP_H
+
+#include "free-space-cache.h"
+
+enum btrfs_disk_cache_state {
+ BTRFS_DC_WRITTEN,
+ BTRFS_DC_ERROR,
+ BTRFS_DC_CLEAR,
+ BTRFS_DC_SETUP,
+};
+
+/*
+ * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
+ * only allocate a chunk if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few
+ * chunks already allocated. This is used as part of the clustering code to
+ * help make sure we have a good pool of storage to cluster in, without filling
+ * the FS with empty chunks
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ */
+enum btrfs_chunk_alloc_enum {
+ CHUNK_ALLOC_NO_FORCE,
+ CHUNK_ALLOC_LIMITED,
+ CHUNK_ALLOC_FORCE,
+};
+
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_work work;
+ struct btrfs_block_group *block_group;
+ u64 progress;
+ refcount_t count;
+};
+
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP SZ_2M
+
+struct btrfs_block_group {
+ struct btrfs_fs_info *fs_info;
+ struct inode *inode;
+ spinlock_t lock;
+ u64 start;
+ u64 length;
+ u64 pinned;
+ u64 reserved;
+ u64 used;
+ u64 delalloc_bytes;
+ u64 bytes_super;
+ u64 flags;
+ u64 cache_generation;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
+
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
+ /* For raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+
+ unsigned int ro;
+ unsigned int iref:1;
+ unsigned int has_caching_ctl:1;
+ unsigned int removed:1;
+
+ int disk_cache_state;
+
+ /* Cache tracking stuff */
+ int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
+
+ struct btrfs_space_info *space_info;
+
+ /* Free space cache stuff */
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ /* Block group cache stuff */
+ struct rb_node cache_node;
+
+ /* For block groups in the same raid type */
+ struct list_head list;
+
+ /* Usage count */
+ atomic_t count;
+
+ /*
+ * List of struct btrfs_free_clusters for this block group.
+ * Today it will only have one thing on it, but that may change
+ */
+ struct list_head cluster_list;
+
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
+
+ atomic_t trimming;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
+ struct list_head io_list;
+
+ struct btrfs_io_ctl io_ctl;
+
+ /*
+ * Incremented when doing extent allocations and holding a read lock
+ * on the space_info's groups_sem semaphore.
+ * Decremented when an ordered extent that represents an IO against this
+ * block group's range is created (after it's added to its inode's
+ * root's list of ordered extents) or immediately after the allocation
+ * if it's a metadata extent or fallocate extent (for these cases we
+ * don't create ordered extents).
+ */
+ atomic_t reservations;
+
+ /*
+ * Incremented while holding the spinlock *lock* by a task checking if
+ * it can perform a nocow write (incremented if the value for the *ro*
+ * field is 0). Decremented by such tasks once they create an ordered
+ * extent or before that if some error happens before reaching that step.
+ * This is to prevent races between block group relocation and nocow
+ * writes through direct IO.
+ */
+ atomic_t nocow_writers;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
+
+ /* Record locked full stripes for RAID5/6 block group */
+ struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int btrfs_should_fragment_free_space(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
+struct btrfs_block_group *btrfs_lookup_first_block_group(
+ struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache);
+void btrfs_get_block_group(struct btrfs_block_group *cache);
+void btrfs_put_block_group(struct btrfs_block_group *cache);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
+ u64 num_bytes);
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
+int btrfs_cache_block_group(struct btrfs_block_group *cache,
+ int load_cache_only);
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
+struct btrfs_caching_control *btrfs_get_caching_control(
+ struct btrfs_block_group *cache);
+u64 add_new_free_space(struct btrfs_block_group *block_group,
+ u64 start, u64 end);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+int btrfs_read_block_groups(struct btrfs_fs_info *info);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
+ u64 type, u64 chunk_offset, u64 size);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc);
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, int alloc);
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc);
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
+ u64 num_bytes, int delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+ enum btrfs_chunk_alloc_enum force);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
+void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+
+static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
+}
+
+static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+}
+
+static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+}
+
+static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED ||
+ cache->cached == BTRFS_CACHE_ERROR;
+}
+
+#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 000000000000..d07bd41a7c1e
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "block-rsv.h"
+#include "space-info.h"
+#include "transaction.h"
+
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ u64 *qgroup_to_release_ret)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 qgroup_to_release = 0;
+ u64 ret;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1) {
+ num_bytes = block_rsv->size;
+ qgroup_to_release = block_rsv->qgroup_rsv_size;
+ }
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ } else {
+ num_bytes = 0;
+ }
+ if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+ qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+ block_rsv->qgroup_rsv_size;
+ block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+ } else {
+ qgroup_to_release = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ ret = num_bytes;
+ if (num_bytes > 0) {
+ if (dest) {
+ spin_lock(&dest->lock);
+ if (!dest->full) {
+ u64 bytes_to_add;
+
+ bytes_to_add = dest->size - dest->reserved;
+ bytes_to_add = min(num_bytes, bytes_to_add);
+ dest->reserved += bytes_to_add;
+ if (dest->reserved >= dest->size)
+ dest->full = 1;
+ num_bytes -= bytes_to_add;
+ }
+ spin_unlock(&dest->lock);
+ }
+ if (num_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info,
+ space_info,
+ num_bytes);
+ }
+ if (qgroup_to_release_ret)
+ *qgroup_to_release_ret = qgroup_to_release;
+ return ret;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes,
+ bool update_size)
+{
+ int ret;
+
+ ret = btrfs_block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ rsv->type = type;
+}
+
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ unsigned short type)
+{
+ btrfs_init_block_rsv(rsv, type);
+ rsv->space_info = btrfs_find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+ unsigned short type)
+{
+ struct btrfs_block_rsv *block_rsv;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv)
+{
+ if (!rsv)
+ return;
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ kfree(rsv);
+}
+
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret)
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
+
+ return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ else
+ num_bytes -= block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (!ret)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
+ return 0;
+ }
+
+ return ret;
+}
+
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, u64 *qgroup_to_release)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *target = NULL;
+
+ /*
+ * If we are the delayed_rsv then push to the global rsv, otherwise dump
+ * into the delayed rsv if it is not full.
+ */
+ if (block_rsv == delayed_rsv)
+ target = global_rsv;
+ else if (block_rsv != global_rsv && !delayed_rsv->full)
+ target = delayed_rsv;
+
+ if (target && block_rsv->space_info != target->space_info)
+ target = NULL;
+
+ return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+ qgroup_to_release);
+}
+
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
+{
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = 0;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, bool update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = 1;
+ spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ btrfs_block_rsv_add_bytes(dest, num_bytes, true);
+ return 0;
+}
+
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ u64 num_bytes;
+ unsigned min_items;
+
+ /*
+ * The global block rsv is based on the size of the extent tree, the
+ * checksum tree and the root tree. If the fs is empty we want to set
+ * it to a minimal amount for safety.
+ */
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+ btrfs_root_used(&fs_info->csum_root->root_item) +
+ btrfs_root_used(&fs_info->tree_root->root_item);
+
+ /*
+ * We at a minimum are going to modify the csum root, the tree root, and
+ * the extent root.
+ */
+ min_items = 3;
+
+ /*
+ * But we also want to reserve enough space so we can do the fallback
+ * global reserve for an unlink, which is an additional 5 items (see the
+ * comment in __unlink_start_trans for what we're modifying.)
+ *
+ * But we also need space for the delayed ref updates from the unlink,
+ * so its 10, 5 for the actual operation, and 5 for the delayed ref
+ * updates.
+ */
+ min_items += 10;
+
+ num_bytes = max_t(u64, num_bytes,
+ btrfs_calc_insert_metadata_size(fs_info, min_items));
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&block_rsv->lock);
+
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
+
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ block_rsv->reserved += num_bytes;
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+ num_bytes);
+ } else if (block_rsv->reserved > block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+ -num_bytes);
+ block_rsv->reserved = block_rsv->size;
+ btrfs_try_granting_tickets(fs_info, sinfo);
+ }
+
+ if (block_rsv->reserved == block_rsv->size)
+ block_rsv->full = 1;
+ else
+ block_rsv->full = 0;
+
+ spin_unlock(&block_rsv->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->delayed_block_rsv.space_info = space_info;
+ fs_info->delayed_refs_rsv.space_info = space_info;
+
+ fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
+ fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ if (fs_info->quota_root)
+ fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+ btrfs_update_global_block_rsv(fs_info);
+}
+
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.size > 0);
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == fs_info->csum_root && trans->adding_csums) ||
+ (root == fs_info->uuid_root))
+ block_rsv = trans->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ int ret;
+ bool global_updated = false;
+
+ block_rsv = get_block_rsv(trans, root);
+
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ btrfs_update_global_block_rsv(fs_info);
+ goto again;
+ }
+
+ /*
+ * The global reserve still exists to save us from ourselves, so don't
+ * warn_on if we are short on our delayed refs reserve.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+ btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "BTRFS: block rsv returned %d\n", ret);
+ }
+try_reserve:
+ ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ return block_rsv;
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
+ }
+ return ERR_PTR(ret);
+}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 000000000000..d1428bb73fc5
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_RSV_H
+#define BTRFS_BLOCK_RSV_H
+
+struct btrfs_trans_handle;
+enum btrfs_reserve_flush_enum;
+
+/*
+ * Types of block reserves
+ */
+enum {
+ BTRFS_BLOCK_RSV_GLOBAL,
+ BTRFS_BLOCK_RSV_DELALLOC,
+ BTRFS_BLOCK_RSV_TRANS,
+ BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_DELOPS,
+ BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_EMPTY,
+ BTRFS_BLOCK_RSV_TEMP,
+};
+
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ struct btrfs_space_info *space_info;
+ spinlock_t lock;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
+
+ /*
+ * Qgroup equivalent for @size @reserved
+ *
+ * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+ * about things like csum size nor how many tree blocks it will need to
+ * reserve.
+ *
+ * Qgroup cares more about net change of the extent usage.
+ *
+ * So for one newly inserted file extent, in worst case it will cause
+ * leaf split and level increase, nodesize for each file extent is
+ * already too much.
+ *
+ * In short, qgroup_size/reserved is the upper limit of possible needed
+ * qgroup metadata reservation.
+ */
+ u64 qgroup_rsv_size;
+ u64 qgroup_rsv_reserved;
+};
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+ unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ unsigned short type);
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+ bool update_size);
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, bool update_size);
+u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, u64 *qgroup_to_release);
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize);
+
+static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+}
+
+static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u32 blocksize)
+{
+ btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
+ btrfs_block_rsv_release(fs_info, block_rsv, 0);
+}
+
+#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6f5d07415dab..4e12a477d32e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -63,9 +63,6 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
- /* held while doing delalloc reservations */
- struct mutex delalloc_mutex;
-
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
@@ -148,12 +145,6 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
- * Track the transaction id of the last transaction used to create a
- * hard link for the inode. This is used by the log tree (fsync).
- */
- u64 last_link_trans;
-
- /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
@@ -203,8 +194,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
-extern unsigned char btrfs_filetype_table[];
-
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -345,22 +334,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
}
+/* Array of bytes with variable length, hexadecimal format 0x1234 */
+#define CSUM_FMT "0x%*phN"
+#define CSUM_FMT_VALUE(size, bytes) size, bytes
+
static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
- u64 logical_start, u32 csum, u32 csum_expected, int mirror_num)
+ u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
+ struct btrfs_super_block *sb = root->fs_info->super_copy;
+ const u16 csum_size = btrfs_super_csum_size(sb);
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
- "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
- logical_start, csum, csum_expected, mirror_num);
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
else
btrfs_warn_rl(root->fs_info,
- "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
- logical_start, csum, csum_expected, mirror_num);
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
}
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b0c8094528d1..0b52ab4cb964 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -83,7 +83,7 @@
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
-#include <linux/crc32c.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -940,7 +940,7 @@ static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
kfree(sf);
}
-static int btrfsic_process_metablock(
+static noinline_for_stack int btrfsic_process_metablock(
struct btrfsic_state *state,
struct btrfsic_block *const first_block,
struct btrfsic_block_data_ctx *const first_block_ctx,
@@ -1706,13 +1706,14 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
* Test whether the disk block contains a tree block (leaf or node)
* (note that this test fails for the super block)
*/
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- char **datav, unsigned int num_pages)
+static noinline_for_stack int btrfsic_test_for_metadata(
+ struct btrfsic_state *state,
+ char **datav, unsigned int num_pages)
{
struct btrfs_fs_info *fs_info = state->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_header *h;
u8 csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
unsigned int i;
if (num_pages * PAGE_SIZE < state->metablock_size)
@@ -1723,14 +1724,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
return 1;
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
for (i = 0; i < num_pages; i++) {
u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
size_t sublen = i ? PAGE_SIZE :
(PAGE_SIZE - BTRFS_CSUM_SIZE);
- crc = crc32c(crc, data, sublen);
+ crypto_shash_update(shash, data, sublen);
}
- btrfs_csum_final(crc, csum);
+ crypto_shash_final(shash, csum);
if (memcmp(csum, h->csum, state->csum_size))
return 1;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4f2a8ae0aa42..43e1660f450f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -17,6 +17,8 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <crypto/hash.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -27,6 +29,41 @@
#include "extent_io.h"
#include "extent_map.h"
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
@@ -37,11 +74,93 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
case BTRFS_COMPRESS_ZSTD:
case BTRFS_COMPRESS_NONE:
return btrfs_compress_types[type];
+ default:
+ break;
}
return NULL;
}
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
+static int compression_compress_pages(int type, struct list_head *ws,
+ struct address_space *mapping, u64 start, struct page **pages,
+ unsigned long *out_pages, unsigned long *total_in,
+ unsigned long *total_out)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ return zlib_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_LZO:
+ return lzo_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_ZSTD:
+ return zstd_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here. As a sane fallback, return what the
+ * callers will understand as 'no compression happened'.
+ */
+ return -E2BIG;
+ }
+}
+
+static int compression_decompress_bio(int type, struct list_head *ws,
+ struct compressed_bio *cb)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static int compression_decompress(int type, struct list_head *ws,
+ unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -57,32 +176,37 @@ static int check_compressed_csum(struct btrfs_inode *inode,
struct compressed_bio *cb,
u64 disk_start)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int ret;
struct page *page;
unsigned long i;
char *kaddr;
- u32 csum;
- u32 *cb_sum = &cb->sums;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
return 0;
+ shash->tfm = fs_info->csum_shash;
+
for (i = 0; i < cb->nr_pages; i++) {
page = cb->compressed_pages[i];
- csum = ~(u32)0;
+ crypto_shash_init(shash);
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
- btrfs_csum_final(csum, (u8 *)&csum);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
kunmap_atomic(kaddr);
+ crypto_shash_final(shash, (u8 *)&csum);
- if (csum != *cb_sum) {
- btrfs_print_data_csum_error(inode, disk_start, csum,
- *cb_sum, cb->mirror_num);
+ if (memcmp(&csum, cb_sum, csum_size)) {
+ btrfs_print_data_csum_error(inode, disk_start,
+ csum, cb_sum, cb->mirror_num);
ret = -EIO;
goto fail;
}
- cb_sum++;
+ cb_sum += csum_size;
}
ret = 0;
@@ -160,7 +284,6 @@ csum_failed:
if (cb->errors) {
bio_io_error(cb->orig_bio);
} else {
- int i;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
@@ -169,7 +292,7 @@ csum_failed:
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
@@ -251,7 +374,7 @@ static void end_compressed_bio_write(struct bio *bio)
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1,
- bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP);
+ bio->bi_status == BLK_STS_OK);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -289,7 +412,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags)
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio *bio = NULL;
@@ -298,7 +422,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
- struct block_device *bdev;
blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -317,12 +440,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bdev = fs_info->fs_devices->latest_bdev;
-
- bio = btrfs_bio_alloc(bdev, first_byte);
+ bio = btrfs_bio_alloc(first_byte);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+
+ if (blkcg_css) {
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ kthread_associate_blkcg(blkcg_css);
+ }
refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
@@ -355,16 +481,18 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
- bio = btrfs_bio_alloc(bdev, first_byte);
+ bio = btrfs_bio_alloc(first_byte);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -385,12 +513,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
return 0;
}
@@ -529,7 +660,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long nr_pages;
unsigned long pg_index;
struct page *page;
- struct block_device *bdev;
struct bio *comp_bio;
u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
u64 em_len;
@@ -537,7 +667,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct extent_map *em;
blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
- u32 *sums;
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -559,7 +690,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
- sums = &cb->sums;
+ sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
@@ -579,8 +710,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb->compressed_pages)
goto fail1;
- bdev = fs_info->fs_devices->latest_bdev;
-
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
@@ -598,7 +727,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
+ comp_bio = btrfs_bio_alloc(cur_disk_byte);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -618,6 +747,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
+ unsigned int nr_sectors;
+
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -635,16 +766,18 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
sums);
BUG_ON(ret); /* -ENOMEM */
}
- sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- fs_info->sectorsize);
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+ fs_info->sectorsize);
+ sums += csum_size * nr_sectors;
+
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
- comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
+ comp_bio = btrfs_bio_alloc(cur_disk_byte);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -662,7 +795,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
@@ -733,26 +866,6 @@ struct heuristic_ws {
static struct workspace_manager heuristic_wsm;
-static void heuristic_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
-}
-
-static void heuristic_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&heuristic_wsm);
-}
-
-static struct list_head *heuristic_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&heuristic_wsm, level);
-}
-
-static void heuristic_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&heuristic_wsm, ws);
-}
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -793,12 +906,7 @@ fail:
}
const struct btrfs_compress_op btrfs_heuristic_compress = {
- .init_workspace_manager = heuristic_init_workspace_manager,
- .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
- .get_workspace = heuristic_get_workspace,
- .put_workspace = heuristic_put_workspace,
- .alloc_workspace = alloc_heuristic_ws,
- .free_workspace = free_heuristic_ws,
+ .workspace_manager = &heuristic_wsm,
};
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
@@ -809,13 +917,44 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops)
+static struct list_head *alloc_workspace(int type, unsigned int level)
{
- struct list_head *workspace;
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void free_workspace(int type, struct list_head *ws)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
+ case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
+ case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
- wsm->ops = ops;
+static void btrfs_init_workspace_manager(int type)
+{
+ struct workspace_manager *wsm;
+ struct list_head *workspace;
+ wsm = btrfs_compress_op[type]->workspace_manager;
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
@@ -825,7 +964,7 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm,
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = wsm->ops->alloc_workspace(0);
+ workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
pr_warn(
"BTRFS: cannot preallocate compression workspace, will try later\n");
@@ -836,14 +975,16 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm,
}
}
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+static void btrfs_cleanup_workspace_manager(int type)
{
+ struct workspace_manager *wsman;
struct list_head *ws;
+ wsman = btrfs_compress_op[type]->workspace_manager;
while (!list_empty(&wsman->idle_ws)) {
ws = wsman->idle_ws.next;
list_del(ws);
- wsman->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(&wsman->total_ws);
}
}
@@ -854,9 +995,9 @@ void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level)
+struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
+ struct workspace_manager *wsm;
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -866,6 +1007,7 @@ struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -901,7 +1043,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = wsm->ops->alloc_workspace(level);
+ workspace = alloc_workspace(type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -934,21 +1076,34 @@ again:
static struct list_head *get_workspace(int type, int level)
{
- return btrfs_compress_op[type]->get_workspace(level);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
+void btrfs_put_workspace(int type, struct list_head *ws)
{
+ struct workspace_manager *wsm;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -964,7 +1119,7 @@ void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
}
spin_unlock(ws_lock);
- wsm->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
@@ -972,7 +1127,18 @@ wake:
static void put_workspace(int type, struct list_head *ws)
{
- return btrfs_compress_op[type]->put_workspace(ws);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
}
/*
@@ -1009,11 +1175,10 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
struct list_head *workspace;
int ret;
+ level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
- start, pages,
- out_pages,
- total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, pages,
+ out_pages, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -1039,7 +1204,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ ret = compression_decompress_bio(type, workspace, cb);
put_workspace(type, workspace);
return ret;
@@ -1057,9 +1222,8 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int ret;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress(workspace, data_in,
- dest_page, start_byte,
- srclen, destlen);
+ ret = compression_decompress(type, workspace, data_in, dest_page,
+ start_byte, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -1067,18 +1231,18 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
void __init btrfs_init_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->init_workspace_manager();
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_init_workspace_manager();
}
void __cold btrfs_exit_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->cleanup_workspace_manager();
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_cleanup_workspace_manager();
}
/*
@@ -1580,7 +1744,23 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
level = 0;
}
- level = btrfs_compress_op[type]->set_level(level);
+ level = btrfs_compress_set_level(type, level);
+
+ return level;
+}
+
+/*
+ * Adjust @level according to the limits of the compression algorithm or
+ * fallback to default
+ */
+unsigned int btrfs_compress_set_level(int type, unsigned level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ if (level == 0)
+ level = ops->default_level;
+ else
+ level = min(level, ops->max_level);
return level;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9976fe0f7526..d253f7aa8ed5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,7 +61,7 @@ struct compressed_bio {
* the start of a variable length array of checksums only
* used by reads
*/
- u32 sums;
+ u8 sums[];
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -93,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags);
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
@@ -104,11 +105,10 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_COMPRESS_TYPES = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
};
struct workspace_manager {
- const struct btrfs_compress_op *ops;
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
@@ -119,53 +119,18 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops);
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level);
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+struct list_head *btrfs_get_workspace(int type, unsigned int level);
+void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
- void (*init_workspace_manager)(void);
-
- void (*cleanup_workspace_manager)(void);
-
- struct list_head *(*get_workspace)(unsigned int level);
-
- void (*put_workspace)(struct list_head *ws);
-
- struct list_head *(*alloc_workspace)(unsigned int level);
-
- void (*free_workspace)(struct list_head *workspace);
-
- int (*compress_pages)(struct list_head *workspace,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
-
- int (*decompress_bio)(struct list_head *workspace,
- struct compressed_bio *cb);
-
- int (*decompress)(struct list_head *workspace,
- unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen);
-
- /*
- * This bounds the level set by the user to be within range of a
- * particular compression type. It returns the level that will be used
- * if the level is out of bounds or the default if 0 is passed in.
- */
- unsigned int (*set_level)(unsigned int level);
+ struct workspace_manager *workspace_manager;
+ /* Maximum level supported by the compression algorithm */
+ unsigned int max_level;
+ unsigned int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
-#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
@@ -173,6 +138,9 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
+
+unsigned int btrfs_compress_set_level(int type, unsigned level);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 324df36d28bf..24658b5a5787 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -21,44 +21,60 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct extent_buffer *dst,
struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
-struct btrfs_path *btrfs_alloc_path(void)
+static const struct btrfs_csums {
+ u16 size;
+ const char *name;
+ const char *driver;
+} btrfs_csums[] = {
+ [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
+};
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
- return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+ u16 t = btrfs_super_csum_type(s);
+ /*
+ * csum type is validated at mount time
+ */
+ return btrfs_csums[t].size;
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].name;
}
/*
- * set all locked nodes in the path to blocking locks. This should
- * be done before scheduling
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
*/
-noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+const char *btrfs_super_csum_driver(u16 csum_type)
{
- int i;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver ?:
+ btrfs_csums[csum_type].name;
+}
+
+size_t __const btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
/* this also releases the path */
@@ -363,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq > min_seq)
+ if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
@@ -378,8 +394,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* The 'start address' is the logical address of the *new* root node
* for root replace operations, or the logical address of the affected
* block for all other operations.
- *
- * Note: must be called with write lock for fs_info::tree_mod_log_lock.
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -389,6 +403,8 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
+ lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
@@ -726,11 +742,11 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
-static noinline int
-tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
struct extent_buffer *src, unsigned long dst_offset,
unsigned long src_offset, int nr_items)
{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
int ret = 0;
struct tree_mod_elem **tm_list = NULL;
struct tree_mod_elem **tm_list_add, **tm_list_rem;
@@ -950,7 +966,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans, fs_info,
+ ret = btrfs_set_disk_extent_flags(trans,
buf->start,
buf->len,
new_flags, level, 0);
@@ -970,7 +986,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- clean_tree_block(fs_info, buf);
+ btrfs_clean_tree_block(buf);
*last_ref = 1;
}
return 0;
@@ -1105,7 +1121,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
ret = tree_mod_log_insert_root(root->node, cow, 1);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1345,6 +1361,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
@@ -1382,6 +1399,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(old);
}
} else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
@@ -1398,7 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
@@ -1541,7 +1559,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
/*
* same as comp_keys only with two btrfs_key's
*/
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
if (k1->objectid > k2->objectid)
return 1;
@@ -1792,9 +1810,8 @@ static void root_sub_used(struct btrfs_root *root, u32 size)
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
*/
-static noinline struct extent_buffer *
-read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
- int slot)
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+ int slot)
{
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
@@ -1806,7 +1823,7 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
BUG_ON(level == 0);
btrfs_node_key_to_cpu(parent, &first_key, slot);
- eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
+ eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
@@ -1863,7 +1880,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
/* promote the child to a root */
- child = read_node_slot(fs_info, mid, 0);
+ child = btrfs_read_node_slot(mid, 0);
if (IS_ERR(child)) {
ret = PTR_ERR(child);
btrfs_handle_fs_error(fs_info, ret, NULL);
@@ -1888,7 +1905,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
path->nodes[level] = NULL;
- clean_tree_block(fs_info, mid);
+ btrfs_clean_tree_block(mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
@@ -1903,7 +1920,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
return 0;
- left = read_node_slot(fs_info, parent, pslot - 1);
+ left = btrfs_read_node_slot(parent, pslot - 1);
if (IS_ERR(left))
left = NULL;
@@ -1918,7 +1935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
}
- right = read_node_slot(fs_info, parent, pslot + 1);
+ right = btrfs_read_node_slot(parent, pslot + 1);
if (IS_ERR(right))
right = NULL;
@@ -1936,7 +1953,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* first, try to make some room in the middle buffer */
if (left) {
orig_slot += btrfs_header_nritems(left);
- wret = push_node_left(trans, fs_info, left, mid, 1);
+ wret = push_node_left(trans, left, mid, 1);
if (wret < 0)
ret = wret;
}
@@ -1945,11 +1962,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* then try to empty the right most buffer into the middle
*/
if (right) {
- wret = push_node_left(trans, fs_info, mid, right, 1);
+ wret = push_node_left(trans, mid, right, 1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- clean_tree_block(fs_info, right);
+ btrfs_clean_tree_block(right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
@@ -1981,20 +1998,20 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_handle_fs_error(fs_info, ret, NULL);
goto enospc;
}
- wret = balance_node_right(trans, fs_info, mid, left);
+ wret = balance_node_right(trans, mid, left);
if (wret < 0) {
ret = wret;
goto enospc;
}
if (wret == 1) {
- wret = push_node_left(trans, fs_info, left, mid, 1);
+ wret = push_node_left(trans, left, mid, 1);
if (wret < 0)
ret = wret;
}
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- clean_tree_block(fs_info, mid);
+ btrfs_clean_tree_block(mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
@@ -2015,7 +2032,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- extent_buffer_get(left);
+ atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -2078,7 +2095,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (!parent)
return 1;
- left = read_node_slot(fs_info, parent, pslot - 1);
+ left = btrfs_read_node_slot(parent, pslot - 1);
if (IS_ERR(left))
left = NULL;
@@ -2098,8 +2115,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (ret)
wret = 1;
else {
- wret = push_node_left(trans, fs_info,
- left, mid, 0);
+ wret = push_node_left(trans, left, mid, 0);
}
}
if (wret < 0)
@@ -2131,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(left);
free_extent_buffer(left);
}
- right = read_node_slot(fs_info, parent, pslot + 1);
+ right = btrfs_read_node_slot(parent, pslot + 1);
if (IS_ERR(right))
right = NULL;
@@ -2154,8 +2170,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (ret)
wret = 1;
else {
- wret = balance_node_right(trans, fs_info,
- right, mid);
+ wret = balance_node_right(trans, right, mid);
}
}
if (wret < 0)
@@ -2360,32 +2375,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * This releases any locks held in the path starting at level and
- * going all the way up to the root.
- *
- * btrfs_search_slot will keep the lock held on higher nodes in a few
- * corner cases, such as COW of the block at slot zero in the node. This
- * ignores those rules, and it should only be called when there are no
- * more updates to be done higher up in the tree.
- */
-noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
-{
- int i;
-
- if (path->keep_locks)
- return;
-
- for (i = level; i < BTRFS_MAX_LEVEL; i++) {
- if (!path->nodes[i])
- continue;
- if (!path->locks[i])
- continue;
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
- path->locks[i] = 0;
- }
-}
-
-/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
@@ -2416,6 +2405,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ /*
+ * Do extra check for first_key, eb can be stale due to
+ * being cached, read from scrub, or have multiple
+ * parents (shared tree blocks).
+ */
+ if (btrfs_verify_level_key(tmp,
+ parent_level - 1, &first_key, gen)) {
+ free_extent_buffer(tmp);
+ return -EUCLEAN;
+ }
*eb_ret = tmp;
return 0;
}
@@ -2623,7 +2622,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
} else {
b = root->commit_root;
- extent_buffer_get(b);
+ atomic_inc(&b->refs);
}
level = btrfs_header_level(b);
/*
@@ -2706,7 +2705,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -2757,12 +2755,10 @@ again:
}
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
- /*
- * setup the path here so we can release it under lock
- * contention with the cow code
- */
if (cow) {
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
@@ -2833,78 +2829,10 @@ cow_done:
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len, &write_lock_level);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
- b = p->nodes[level];
- slot = p->slots[level];
-
- /*
- * slot 0 is special, if we change the key
- * we have to update the parent pointer
- * which means we must have a write lock
- * on the parent
- */
- if (slot == 0 && ins_len &&
- write_lock_level < level + 1) {
- write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
- }
-
- unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- if (!p->skip_locking) {
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- err = btrfs_try_tree_write_lock(b);
- if (!err) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
- p->locks[level] = BTRFS_WRITE_LOCK;
- } else {
- err = btrfs_tree_read_lock_atomic(b);
- if (!err) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- p->locks[level] = BTRFS_READ_LOCK;
- }
- p->nodes[level] = b;
- }
- } else {
+ if (level == 0) {
p->slots[level] = slot;
if (ins_len > 0 &&
- btrfs_leaf_free_space(fs_info, b) < ins_len) {
+ btrfs_leaf_free_space(b) < ins_len) {
if (write_lock_level < 1) {
write_lock_level = 1;
btrfs_release_path(p);
@@ -2926,6 +2854,67 @@ cow_done:
min_write_lock_level, NULL);
goto done;
}
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * Slot 0 is special, if we change the key we have to update
+ * the parent pointer which means we must have a write lock on
+ * the parent
+ */
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
+ &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
}
ret = 1;
done:
@@ -2982,6 +2971,8 @@ again:
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -3002,48 +2993,45 @@ again:
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
+ if (level == 0) {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
- level = btrfs_header_level(b);
- err = btrfs_tree_read_lock_atomic(b);
- if (!err) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
- if (!b) {
- ret = -ENOMEM;
- goto done;
- }
- p->locks[level] = BTRFS_READ_LOCK;
- p->nodes[level] = b;
- } else {
- p->slots[level] = slot;
- unlock_up(p, level, lowest_unlock, 0, NULL);
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
goto done;
}
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
}
ret = 1;
done:
@@ -3181,11 +3169,31 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- BUG_ON(comp_keys(&disk_key, new_key) >= 0);
+ if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ btrfs_crit(fs_info,
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ slot, btrfs_disk_key_objectid(&disk_key),
+ btrfs_disk_key_type(&disk_key),
+ btrfs_disk_key_offset(&disk_key),
+ new_key->objectid, new_key->type,
+ new_key->offset);
+ btrfs_print_leaf(eb);
+ BUG();
+ }
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- BUG_ON(comp_keys(&disk_key, new_key) <= 0);
+ if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ btrfs_crit(fs_info,
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ slot, btrfs_disk_key_objectid(&disk_key),
+ btrfs_disk_key_type(&disk_key),
+ btrfs_disk_key_offset(&disk_key),
+ new_key->objectid, new_key->type,
+ new_key->offset);
+ btrfs_print_leaf(eb);
+ BUG();
+ }
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3203,10 +3211,10 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
* error, and > 0 if there was no room in the left hand block.
*/
static int push_node_left(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct extent_buffer *dst,
struct extent_buffer *src, int empty)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
int src_nritems;
int dst_nritems;
@@ -3239,8 +3247,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
- ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
- push_items);
+ ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3278,10 +3285,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
* this will only push up to 1/2 the contents of the left node over
*/
static int balance_node_right(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct extent_buffer *dst,
struct extent_buffer *src)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
int max_push;
int src_nritems;
@@ -3315,8 +3322,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
- src_nritems - push_items, push_items);
+ ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3389,7 +3396,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- extent_buffer_get(c);
+ atomic_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
@@ -3404,7 +3411,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* blocknr is the block the key points to.
*/
static void insert_ptr(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
@@ -3417,7 +3424,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
if (slot != nritems) {
if (level) {
ret = tree_mod_log_insert_move(lower, slot + 1, slot,
@@ -3501,7 +3508,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
- ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
+ ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3517,7 +3524,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- insert_ptr(trans, fs_info, path, &disk_key, split->start,
+ insert_ptr(trans, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
@@ -3549,7 +3556,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
if (!nr)
return 0;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, l);
start_item = btrfs_item_nr(start);
end_item = btrfs_item_nr(end);
data_len = btrfs_token_item_offset(l, start_item, &token) +
@@ -3565,9 +3572,9 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
* the start of the leaf data. IOW, how much room
* the leaf has left for both items and data
*/
-noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf)
+noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
int nritems = btrfs_header_nritems(leaf);
int ret;
@@ -3586,13 +3593,13 @@ noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
* min slot controls the lowest index we're willing to push to the
* right. We'll push up to and including min_slot, but no lower
*/
-static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+static noinline int __push_leaf_right(struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
{
+ struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
struct btrfs_map_token token;
@@ -3607,8 +3614,6 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
u32 data_end;
u32 this_item_size;
- btrfs_init_map_token(&token);
-
if (empty)
nr = 0;
else
@@ -3626,7 +3631,8 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
if (path->slots[0] > i)
break;
if (path->slots[0] == i) {
- int space = btrfs_leaf_free_space(fs_info, left);
+ int space = btrfs_leaf_free_space(left);
+
if (space + push_space * 2 > free_space)
break;
}
@@ -3655,10 +3661,10 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
right_nritems = btrfs_header_nritems(right);
push_space = btrfs_item_end_nr(left, left_nritems - push_items);
- push_space -= leaf_data_end(fs_info, left);
+ push_space -= leaf_data_end(left);
/* make room in the right data area */
- data_end = leaf_data_end(fs_info, right);
+ data_end = leaf_data_end(right);
memmove_extent_buffer(right,
BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
BTRFS_LEAF_DATA_OFFSET + data_end,
@@ -3667,7 +3673,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
/* copy from the left data area */
copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
+ BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3680,6 +3686,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
push_items * sizeof(struct btrfs_item));
/* update the item pointers */
+ btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
@@ -3695,7 +3702,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
- clean_tree_block(fs_info, left);
+ btrfs_clean_tree_block(left);
btrfs_mark_buffer_dirty(right);
@@ -3707,7 +3714,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(fs_info, path->nodes[0]);
+ btrfs_clean_tree_block(path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3739,7 +3746,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
int min_data_size, int data_size,
int empty, u32 min_slot)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
struct extent_buffer *upper;
@@ -3758,7 +3764,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
- right = read_node_slot(fs_info, upper, slot + 1);
+ right = btrfs_read_node_slot(upper, slot + 1);
/*
* slot + 1 is not valid or we fail to read the right node,
* no big deal, just return.
@@ -3769,7 +3775,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_tree_lock(right);
btrfs_set_lock_blocking_write(right);
- free_space = btrfs_leaf_free_space(fs_info, right);
+ free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
@@ -3779,7 +3785,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(fs_info, right);
+ free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
@@ -3800,7 +3806,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 0;
}
- return __push_leaf_right(fs_info, path, min_data_size, empty,
+ return __push_leaf_right(path, min_data_size, empty,
right, free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
@@ -3816,12 +3822,12 @@ out_unlock:
* item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
* items
*/
-static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, int data_size,
+static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
+ struct btrfs_fs_info *fs_info = left->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
int i;
@@ -3835,8 +3841,6 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
u32 old_left_item_size;
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
-
if (empty)
nr = min(right_nritems, max_slot);
else
@@ -3849,7 +3853,8 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
if (path->slots[0] < i)
break;
if (path->slots[0] == i) {
- int space = btrfs_leaf_free_space(fs_info, right);
+ int space = btrfs_leaf_free_space(right);
+
if (space + push_space * 2 > free_space)
break;
}
@@ -3882,13 +3887,14 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
btrfs_item_offset_nr(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(fs_info, left) - push_space,
+ leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
btrfs_item_offset_nr(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
+ btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
@@ -3909,17 +3915,19 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
- leaf_data_end(fs_info, right);
+ leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(fs_info, right), push_space);
+ leaf_data_end(right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(push_items),
(btrfs_header_nritems(right) - push_items) *
sizeof(struct btrfs_item));
}
+
+ btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
@@ -3935,7 +3943,7 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
- clean_tree_block(fs_info, right);
+ btrfs_clean_tree_block(right);
btrfs_item_key(right, &disk_key, 0);
fixup_low_keys(path, &disk_key, 1);
@@ -3972,7 +3980,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int min_data_size,
int data_size, int empty, u32 max_slot)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = path->nodes[0];
struct extent_buffer *left;
int slot;
@@ -3992,7 +3999,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
- left = read_node_slot(fs_info, path->nodes[1], slot - 1);
+ left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
* slot - 1 is not valid or we fail to read the left node,
* no big deal, just return.
@@ -4003,7 +4010,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_tree_lock(left);
btrfs_set_lock_blocking_write(left);
- free_space = btrfs_leaf_free_space(fs_info, left);
+ free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
ret = 1;
goto out;
@@ -4019,13 +4026,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(fs_info, left);
+ free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
ret = 1;
goto out;
}
- return __push_leaf_left(fs_info, path, min_data_size,
+ return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
out:
@@ -4039,23 +4046,21 @@ out:
* available for the resulting leaf level of the path.
*/
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *l,
struct extent_buffer *right,
int slot, int mid, int nritems)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int data_copy_size;
int rt_data_off;
int i;
struct btrfs_disk_key disk_key;
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
-
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -4064,10 +4069,11 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
copy_extent_buffer(right, l,
BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(fs_info, l), data_copy_size);
+ leaf_data_end(l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
@@ -4079,8 +4085,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
- insert_ptr(trans, fs_info, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
+ insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -4115,7 +4120,6 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
int data_size)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int progress = 0;
int slot;
@@ -4124,7 +4128,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot < btrfs_header_nritems(path->nodes[0]))
- space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
/*
* try to push all the items after our slot into the
@@ -4145,14 +4149,14 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
if (path->slots[0] == 0 || path->slots[0] == nritems)
return 0;
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
return 0;
/* try to push all the items before our slot into the next leaf */
slot = path->slots[0];
space_needed = data_size;
if (slot > 0)
- space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
@@ -4201,7 +4205,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int space_needed = data_size;
if (slot < btrfs_header_nritems(l))
- space_needed -= btrfs_leaf_free_space(fs_info, l);
+ space_needed -= btrfs_leaf_free_space(l);
wret = push_leaf_right(trans, root, path, space_needed,
space_needed, 0, 0);
@@ -4210,8 +4214,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
if (wret) {
space_needed = data_size;
if (slot > 0)
- space_needed -= btrfs_leaf_free_space(fs_info,
- l);
+ space_needed -= btrfs_leaf_free_space(l);
wret = push_leaf_left(trans, root, path, space_needed,
space_needed, 0, (u32)-1);
if (wret < 0)
@@ -4220,7 +4223,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
/* did the pushes work? */
- if (btrfs_leaf_free_space(fs_info, l) >= data_size)
+ if (btrfs_leaf_free_space(l) >= data_size)
return 0;
}
@@ -4288,7 +4291,7 @@ again:
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, fs_info, path, &disk_key,
+ insert_ptr(trans, path, &disk_key,
right->start, path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
@@ -4297,7 +4300,7 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, fs_info, path, &disk_key,
+ insert_ptr(trans, path, &disk_key,
right->start, path->slots[1], 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
@@ -4314,7 +4317,7 @@ again:
return ret;
}
- copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
+ copy_for_split(trans, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -4327,7 +4330,7 @@ again:
push_for_double:
push_for_double_split(trans, root, path, data_size);
tried_avoid_double = 1;
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
return 0;
goto again;
}
@@ -4336,7 +4339,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int ins_len)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
@@ -4350,7 +4352,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
key.type != BTRFS_EXTENT_CSUM_KEY);
- if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
+ if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -4377,7 +4379,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
/* the leaf has changed, it now has room. return now */
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
+ if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
goto err;
if (key.type == BTRFS_EXTENT_DATA_KEY) {
@@ -4400,8 +4402,7 @@ err:
return ret;
}
-static noinline int split_item(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+static noinline int split_item(struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
@@ -4416,7 +4417,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info,
struct btrfs_disk_key disk_key;
leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
+ BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
btrfs_set_path_blocking(path);
@@ -4465,7 +4466,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info,
item_size - split_offset);
btrfs_mark_buffer_dirty(leaf);
- BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
+ BUG_ON(btrfs_leaf_free_space(leaf) < 0);
kfree(buf);
return 0;
}
@@ -4497,7 +4498,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(root->fs_info, path, new_key, split_offset);
+ ret = split_item(path, new_key, split_offset);
return ret;
}
@@ -4543,8 +4544,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4557,8 +4557,6 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
int i;
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
-
leaf = path->nodes[0];
slot = path->slots[0];
@@ -4567,7 +4565,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
return;
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(fs_info, leaf);
+ data_end = leaf_data_end(leaf);
old_data_start = btrfs_item_offset_nr(leaf, slot);
@@ -4580,6 +4578,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
+ btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
@@ -4633,7 +4632,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4642,8 +4641,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- u32 data_size)
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -4655,14 +4653,12 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
int i;
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
-
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(fs_info, leaf);
+ data_end = leaf_data_end(leaf);
- if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
+ if (btrfs_leaf_free_space(leaf) < data_size) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4672,15 +4668,16 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
BUG_ON(slot < 0);
if (slot >= nritems) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d too large, nritems %d",
+ btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
- BUG_ON(1);
+ BUG();
}
/*
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
+ btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
@@ -4701,7 +4698,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4732,21 +4729,20 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
btrfs_unlock_up_safe(path, 1);
- btrfs_init_map_token(&token);
-
leaf = path->nodes[0];
slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(fs_info, leaf);
+ data_end = leaf_data_end(leaf);
- if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
+ if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
- total_size, btrfs_leaf_free_space(fs_info, leaf));
+ total_size, btrfs_leaf_free_space(leaf));
BUG();
}
+ btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
@@ -4754,7 +4750,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
slot, old_data, data_end);
- BUG_ON(1);
+ BUG();
}
/*
* item0..itemN ... dataN.offset..dataN.size .. data0.size
@@ -4794,7 +4790,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4933,7 +4929,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
@@ -4953,9 +4949,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int wret;
int i;
u32 nritems;
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token);
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -4966,13 +4959,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(fs_info, leaf);
+ int data_end = leaf_data_end(leaf);
+ struct btrfs_map_token token;
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
BTRFS_LEAF_DATA_OFFSET + data_end,
last_off - data_end);
+ btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
@@ -4996,7 +4991,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_level(leaf, 0);
} else {
btrfs_set_path_blocking(path);
- clean_tree_block(fs_info, leaf);
+ btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
@@ -5015,7 +5010,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to del_ptr below
*/
slot = path->slots[1];
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
@@ -5126,7 +5121,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path,
u64 min_trans)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
struct btrfs_key found_key;
int slot;
@@ -5207,7 +5201,7 @@ find_next_key:
goto out;
}
btrfs_set_path_blocking(path);
- cur = read_node_slot(fs_info, cur, slot);
+ cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
goto out;
@@ -5229,371 +5223,6 @@ out:
return ret;
}
-static int tree_move_down(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- int *level)
-{
- struct extent_buffer *eb;
-
- BUG_ON(*level == 0);
- eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
- if (IS_ERR(eb))
- return PTR_ERR(eb);
-
- path->nodes[*level - 1] = eb;
- path->slots[*level - 1] = 0;
- (*level)--;
- return 0;
-}
-
-static int tree_move_next_or_upnext(struct btrfs_path *path,
- int *level, int root_level)
-{
- int ret = 0;
- int nritems;
- nritems = btrfs_header_nritems(path->nodes[*level]);
-
- path->slots[*level]++;
-
- while (path->slots[*level] >= nritems) {
- if (*level == root_level)
- return -1;
-
- /* move upnext */
- path->slots[*level] = 0;
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
- (*level)++;
- path->slots[*level]++;
-
- nritems = btrfs_header_nritems(path->nodes[*level]);
- ret = 1;
- }
- return ret;
-}
-
-/*
- * Returns 1 if it had to move up and next. 0 is returned if it moved only next
- * or down.
- */
-static int tree_advance(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- int *level, int root_level,
- int allow_down,
- struct btrfs_key *key)
-{
- int ret;
-
- if (*level == 0 || !allow_down) {
- ret = tree_move_next_or_upnext(path, level, root_level);
- } else {
- ret = tree_move_down(fs_info, path, level);
- }
- if (ret >= 0) {
- if (*level == 0)
- btrfs_item_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- else
- btrfs_node_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- }
- return ret;
-}
-
-static int tree_compare_item(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- char *tmp_buf)
-{
- int cmp;
- int len1, len2;
- unsigned long off1, off2;
-
- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
- if (len1 != len2)
- return 1;
-
- off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
- off2 = btrfs_item_ptr_offset(right_path->nodes[0],
- right_path->slots[0]);
-
- read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
-
- cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
- if (cmp)
- return 1;
- return 0;
-}
-
-#define ADVANCE 1
-#define ADVANCE_ONLY_NEXT -1
-
-/*
- * This function compares two trees and calls the provided callback for
- * every changed/new/deleted item it finds.
- * If shared tree blocks are encountered, whole subtrees are skipped, making
- * the compare pretty fast on snapshotted subvolumes.
- *
- * This currently works on commit roots only. As commit roots are read only,
- * we don't do any locking. The commit roots are protected with transactions.
- * Transactions are ended and rejoined when a commit is tried in between.
- *
- * This function checks for modifications done to the trees while comparing.
- * If it detects a change, it aborts immediately.
- */
-int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
-{
- struct btrfs_fs_info *fs_info = left_root->fs_info;
- int ret;
- int cmp;
- struct btrfs_path *left_path = NULL;
- struct btrfs_path *right_path = NULL;
- struct btrfs_key left_key;
- struct btrfs_key right_key;
- char *tmp_buf = NULL;
- int left_root_level;
- int right_root_level;
- int left_level;
- int right_level;
- int left_end_reached;
- int right_end_reached;
- int advance_left;
- int advance_right;
- u64 left_blockptr;
- u64 right_blockptr;
- u64 left_gen;
- u64 right_gen;
-
- left_path = btrfs_alloc_path();
- if (!left_path) {
- ret = -ENOMEM;
- goto out;
- }
- right_path = btrfs_alloc_path();
- if (!right_path) {
- ret = -ENOMEM;
- goto out;
- }
-
- tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
- if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- left_path->search_commit_root = 1;
- left_path->skip_locking = 1;
- right_path->search_commit_root = 1;
- right_path->skip_locking = 1;
-
- /*
- * Strategy: Go to the first items of both trees. Then do
- *
- * If both trees are at level 0
- * Compare keys of current items
- * If left < right treat left item as new, advance left tree
- * and repeat
- * If left > right treat right item as deleted, advance right tree
- * and repeat
- * If left == right do deep compare of items, treat as changed if
- * needed, advance both trees and repeat
- * If both trees are at the same level but not at level 0
- * Compare keys of current nodes/leafs
- * If left < right advance left tree and repeat
- * If left > right advance right tree and repeat
- * If left == right compare blockptrs of the next nodes/leafs
- * If they match advance both trees but stay at the same level
- * and repeat
- * If they don't match advance both trees while allowing to go
- * deeper and repeat
- * If tree levels are different
- * Advance the tree that needs it and repeat
- *
- * Advancing a tree means:
- * If we are at level 0, try to go to the next slot. If that's not
- * possible, go one level up and repeat. Stop when we found a level
- * where we could go to the next slot. We may at this point be on a
- * node or a leaf.
- *
- * If we are not at level 0 and not on shared tree blocks, go one
- * level deeper.
- *
- * If we are not at level 0 and on shared tree blocks, go one slot to
- * the right if possible or go up and right.
- */
-
- down_read(&fs_info->commit_root_sem);
- left_level = btrfs_header_level(left_root->commit_root);
- left_root_level = left_level;
- left_path->nodes[left_level] =
- btrfs_clone_extent_buffer(left_root->commit_root);
- if (!left_path->nodes[left_level]) {
- up_read(&fs_info->commit_root_sem);
- ret = -ENOMEM;
- goto out;
- }
-
- right_level = btrfs_header_level(right_root->commit_root);
- right_root_level = right_level;
- right_path->nodes[right_level] =
- btrfs_clone_extent_buffer(right_root->commit_root);
- if (!right_path->nodes[right_level]) {
- up_read(&fs_info->commit_root_sem);
- ret = -ENOMEM;
- goto out;
- }
- up_read(&fs_info->commit_root_sem);
-
- if (left_level == 0)
- btrfs_item_key_to_cpu(left_path->nodes[left_level],
- &left_key, left_path->slots[left_level]);
- else
- btrfs_node_key_to_cpu(left_path->nodes[left_level],
- &left_key, left_path->slots[left_level]);
- if (right_level == 0)
- btrfs_item_key_to_cpu(right_path->nodes[right_level],
- &right_key, right_path->slots[right_level]);
- else
- btrfs_node_key_to_cpu(right_path->nodes[right_level],
- &right_key, right_path->slots[right_level]);
-
- left_end_reached = right_end_reached = 0;
- advance_left = advance_right = 0;
-
- while (1) {
- if (advance_left && !left_end_reached) {
- ret = tree_advance(fs_info, left_path, &left_level,
- left_root_level,
- advance_left != ADVANCE_ONLY_NEXT,
- &left_key);
- if (ret == -1)
- left_end_reached = ADVANCE;
- else if (ret < 0)
- goto out;
- advance_left = 0;
- }
- if (advance_right && !right_end_reached) {
- ret = tree_advance(fs_info, right_path, &right_level,
- right_root_level,
- advance_right != ADVANCE_ONLY_NEXT,
- &right_key);
- if (ret == -1)
- right_end_reached = ADVANCE;
- else if (ret < 0)
- goto out;
- advance_right = 0;
- }
-
- if (left_end_reached && right_end_reached) {
- ret = 0;
- goto out;
- } else if (left_end_reached) {
- if (right_level == 0) {
- ret = changed_cb(left_path, right_path,
- &right_key,
- BTRFS_COMPARE_TREE_DELETED,
- ctx);
- if (ret < 0)
- goto out;
- }
- advance_right = ADVANCE;
- continue;
- } else if (right_end_reached) {
- if (left_level == 0) {
- ret = changed_cb(left_path, right_path,
- &left_key,
- BTRFS_COMPARE_TREE_NEW,
- ctx);
- if (ret < 0)
- goto out;
- }
- advance_left = ADVANCE;
- continue;
- }
-
- if (left_level == 0 && right_level == 0) {
- cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
- if (cmp < 0) {
- ret = changed_cb(left_path, right_path,
- &left_key,
- BTRFS_COMPARE_TREE_NEW,
- ctx);
- if (ret < 0)
- goto out;
- advance_left = ADVANCE;
- } else if (cmp > 0) {
- ret = changed_cb(left_path, right_path,
- &right_key,
- BTRFS_COMPARE_TREE_DELETED,
- ctx);
- if (ret < 0)
- goto out;
- advance_right = ADVANCE;
- } else {
- enum btrfs_compare_tree_result result;
-
- WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
- ret = tree_compare_item(left_path, right_path,
- tmp_buf);
- if (ret)
- result = BTRFS_COMPARE_TREE_CHANGED;
- else
- result = BTRFS_COMPARE_TREE_SAME;
- ret = changed_cb(left_path, right_path,
- &left_key, result, ctx);
- if (ret < 0)
- goto out;
- advance_left = ADVANCE;
- advance_right = ADVANCE;
- }
- } else if (left_level == right_level) {
- cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
- if (cmp < 0) {
- advance_left = ADVANCE;
- } else if (cmp > 0) {
- advance_right = ADVANCE;
- } else {
- left_blockptr = btrfs_node_blockptr(
- left_path->nodes[left_level],
- left_path->slots[left_level]);
- right_blockptr = btrfs_node_blockptr(
- right_path->nodes[right_level],
- right_path->slots[right_level]);
- left_gen = btrfs_node_ptr_generation(
- left_path->nodes[left_level],
- left_path->slots[left_level]);
- right_gen = btrfs_node_ptr_generation(
- right_path->nodes[right_level],
- right_path->slots[right_level]);
- if (left_blockptr == right_blockptr &&
- left_gen == right_gen) {
- /*
- * As we're on a shared block, don't
- * allow to go deeper.
- */
- advance_left = ADVANCE_ONLY_NEXT;
- advance_right = ADVANCE_ONLY_NEXT;
- } else {
- advance_left = ADVANCE;
- advance_right = ADVANCE;
- }
- }
- } else if (left_level < right_level) {
- advance_right = ADVANCE;
- } else {
- advance_left = ADVANCE;
- }
- }
-
-out:
- btrfs_free_path(left_path);
- btrfs_free_path(right_path);
- kvfree(tmp_buf);
- return ret;
-}
-
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
@@ -5611,7 +5240,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct extent_buffer *c;
- WARN_ON(!path->keep_locks);
+ WARN_ON(!path->keep_locks && !path->skip_locking);
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level])
return 1;
@@ -5627,7 +5256,7 @@ next:
!path->nodes[level + 1])
return 1;
- if (path->locks[level + 1]) {
+ if (path->locks[level + 1] || path->skip_locking) {
level++;
continue;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b3642367a595..54efb21c2727 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,9 +16,9 @@
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/slab.h>
-#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
+#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>
@@ -28,23 +28,38 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
+#include "block-rsv.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
+struct btrfs_space_info;
+struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
+extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
+struct btrfs_ref;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
-#define BTRFS_MAX_MIRRORS 3
+/*
+ * Maximum number of mirrors that can be available for all profiles counting
+ * the target device of dev-replace as one. During an active device replace
+ * procedure, the target device of the copy operation is a mirror for the
+ * filesystem data as well that can be used to read data in order to repair
+ * read errors on other disks.
+ *
+ * Current value is derived from RAID1C4 with 4 copies.
+ */
+#define BTRFS_MAX_MIRRORS (4 + 1)
#define BTRFS_MAX_LEVEL 8
@@ -69,9 +84,6 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_LINK_MAX 65535U
-/* four bytes for CRC32 */
-static const int btrfs_csum_sizes[] = { 4 };
-
#define BTRFS_EMPTY_DIR_SIZE 0
/* ioprio of readahead is set to idle */
@@ -98,10 +110,6 @@ static inline u32 count_max_extents(u64 size)
return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
}
-struct btrfs_mapping_tree {
- struct extent_map_tree map_tree;
-};
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -284,7 +292,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -387,122 +396,6 @@ struct btrfs_dev_replace {
wait_queue_head_t replace_wait;
};
-/* For raid type sysfs entries */
-struct raid_kobject {
- u64 flags;
- struct kobject kobj;
- struct list_head list;
-};
-
-struct btrfs_space_info {
- spinlock_t lock;
-
- u64 total_bytes; /* total bytes in the space,
- this doesn't take mirrors into account */
- u64 bytes_used; /* total bytes used,
- this doesn't take mirrors into account */
- u64 bytes_pinned; /* total bytes pinned, will be freed when the
- transaction finishes */
- u64 bytes_reserved; /* total bytes the allocator has reserved for
- current allocations */
- u64 bytes_may_use; /* number of bytes that may be used for
- delalloc/allocations */
- u64 bytes_readonly; /* total bytes that are read only */
-
- u64 max_extent_size; /* This will hold the maximum extent size of
- the space info if we had an ENOSPC in the
- allocator. */
-
- unsigned int full:1; /* indicates that we cannot allocate any more
- chunks for this space */
- unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
-
- unsigned int flush:1; /* set if we are trying to make space */
-
- unsigned int force_alloc; /* set if we need to force a chunk
- alloc for this space */
-
- u64 disk_used; /* total bytes used on disk */
- u64 disk_total; /* total bytes on disk, takes mirrors into
- account */
-
- u64 flags;
-
- /*
- * bytes_pinned is kept in line with what is actually pinned, as in
- * we've called update_block_group and dropped the bytes_used counter
- * and increased the bytes_pinned counter. However this means that
- * bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed every time we
- * call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zeroed every
- * time the transaction commits.
- */
- struct percpu_counter total_bytes_pinned;
-
- struct list_head list;
- /* Protected by the spinlock 'lock'. */
- struct list_head ro_bgs;
- struct list_head priority_tickets;
- struct list_head tickets;
- /*
- * tickets_id just indicates the next ticket will be handled, so note
- * it's not stored per ticket.
- */
- u64 tickets_id;
-
- struct rw_semaphore groups_sem;
- /* for block groups in our same type */
- struct list_head block_groups[BTRFS_NR_RAID_TYPES];
- wait_queue_head_t wait;
-
- struct kobject kobj;
- struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
-};
-
-/*
- * Types of block reserves
- */
-enum {
- BTRFS_BLOCK_RSV_GLOBAL,
- BTRFS_BLOCK_RSV_DELALLOC,
- BTRFS_BLOCK_RSV_TRANS,
- BTRFS_BLOCK_RSV_CHUNK,
- BTRFS_BLOCK_RSV_DELOPS,
- BTRFS_BLOCK_RSV_DELREFS,
- BTRFS_BLOCK_RSV_EMPTY,
- BTRFS_BLOCK_RSV_TEMP,
-};
-
-struct btrfs_block_rsv {
- u64 size;
- u64 reserved;
- struct btrfs_space_info *space_info;
- spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
-
- /*
- * Qgroup equivalent for @size @reserved
- *
- * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
- * about things like csum size nor how many tree blocks it will need to
- * reserve.
- *
- * Qgroup cares more about net change of the extent usage.
- *
- * So for one newly inserted file extent, in worst case it will cause
- * leaf split and level increase, nodesize for each file extent is
- * already too much.
- *
- * In short, qgroup_size/reserved is the upper limit of possible needed
- * qgroup metadata reservation.
- */
- u64 qgroup_rsv_size;
- u64 qgroup_rsv_reserved;
-};
-
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
@@ -522,7 +415,7 @@ struct btrfs_free_cluster {
/* We did a full search and couldn't create a cluster */
bool fragmented;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/*
* when a cluster is allocated from a block group, we put the
* cluster onto a list in the block group so that it can
@@ -539,40 +432,6 @@ enum btrfs_caching_type {
BTRFS_CACHE_ERROR,
};
-enum btrfs_disk_cache_state {
- BTRFS_DC_WRITTEN,
- BTRFS_DC_ERROR,
- BTRFS_DC_CLEAR,
- BTRFS_DC_SETUP,
-};
-
-struct btrfs_caching_control {
- struct list_head list;
- struct mutex mutex;
- wait_queue_head_t wait;
- struct btrfs_work work;
- struct btrfs_block_group_cache *block_group;
- u64 progress;
- refcount_t count;
-};
-
-/* Once caching_thread() finds this much free space, it will wake up waiters. */
-#define CACHING_CTL_WAKE_UP SZ_2M
-
-struct btrfs_io_ctl {
- void *cur, *orig;
- struct page *page;
- struct page **pages;
- struct btrfs_fs_info *fs_info;
- struct inode *inode;
- unsigned long size;
- int index;
- int num_pages;
- int entries;
- int bitmaps;
- unsigned check_crcs:1;
-};
-
/*
* Tree to record all locked full stripes of a RAID5/6 block group
*/
@@ -581,120 +440,6 @@ struct btrfs_full_stripe_locks_tree {
struct mutex lock;
};
-struct btrfs_block_group_cache {
- struct btrfs_key key;
- struct btrfs_block_group_item item;
- struct btrfs_fs_info *fs_info;
- struct inode *inode;
- spinlock_t lock;
- u64 pinned;
- u64 reserved;
- u64 delalloc_bytes;
- u64 bytes_super;
- u64 flags;
- u64 cache_generation;
-
- /*
- * If the free space extent count exceeds this number, convert the block
- * group to bitmaps.
- */
- u32 bitmap_high_thresh;
-
- /*
- * If the free space extent count drops below this number, convert the
- * block group back to extents.
- */
- u32 bitmap_low_thresh;
-
- /*
- * It is just used for the delayed data space allocation because
- * only the data space allocation and the relative metadata update
- * can be done cross the transaction.
- */
- struct rw_semaphore data_rwsem;
-
- /* for raid56, this is a full stripe, without parity */
- unsigned long full_stripe_len;
-
- unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
-
- int disk_cache_state;
-
- /* cache tracking stuff */
- int cached;
- struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
-
- struct btrfs_space_info *space_info;
-
- /* free space cache stuff */
- struct btrfs_free_space_ctl *free_space_ctl;
-
- /* block group cache stuff */
- struct rb_node cache_node;
-
- /* for block groups in the same raid type */
- struct list_head list;
-
- /* usage count */
- atomic_t count;
-
- /* List of struct btrfs_free_clusters for this block group.
- * Today it will only have one thing on it, but that may change
- */
- struct list_head cluster_list;
-
- /* For delayed block group creation or deletion of empty block groups */
- struct list_head bg_list;
-
- /* For read-only block groups */
- struct list_head ro_list;
-
- atomic_t trimming;
-
- /* For dirty block groups */
- struct list_head dirty_list;
- struct list_head io_list;
-
- struct btrfs_io_ctl io_ctl;
-
- /*
- * Incremented when doing extent allocations and holding a read lock
- * on the space_info's groups_sem semaphore.
- * Decremented when an ordered extent that represents an IO against this
- * block group's range is created (after it's added to its inode's
- * root's list of ordered extents) or immediately after the allocation
- * if it's a metadata extent or fallocate extent (for these cases we
- * don't create ordered extents).
- */
- atomic_t reservations;
-
- /*
- * Incremented while holding the spinlock *lock* by a task checking if
- * it can perform a nocow write (incremented if the value for the *ro*
- * field is 0). Decremented by such tasks once they create an ordered
- * extent or before that if some error happens before reaching that step.
- * This is to prevent races between block group relocation and nocow
- * writes through direct IO.
- */
- atomic_t nocow_writers;
-
- /* Lock for free space tree operations. */
- struct mutex free_space_lock;
-
- /*
- * Does the block group need to be added to the free space tree?
- * Protected by free_space_lock.
- */
- int needs_free_space;
-
- /* Record locked full stripes for RAID5/6 block group */
- struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
-};
-
/* delayed seq elem */
struct seq_list {
struct list_head list;
@@ -710,22 +455,6 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-/* used by the raid56 code to lock stripes for read/modify/write */
-struct btrfs_stripe_hash {
- struct list_head hash_list;
- spinlock_t lock;
-};
-
-/* used by the raid56 code to lock stripes for read/modify/write */
-struct btrfs_stripe_hash_table {
- struct list_head stripe_cache;
- spinlock_t cache_lock;
- int cache_size;
- struct btrfs_stripe_hash table[];
-};
-
-#define BTRFS_STRIPE_HASH_TABLE_BITS 11
-
void btrfs_init_async_reclaim_work(struct work_struct *work);
/* fs_info */
@@ -749,8 +478,8 @@ struct btrfs_swapfile_pin {
void *ptr;
struct inode *inode;
/*
- * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
- * ptr points to a struct btrfs_device.
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
*/
bool is_block_group;
};
@@ -785,11 +514,18 @@ enum {
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
+ * Set and cleared while holding fs_info::balance_mutex.
*/
BTRFS_FS_BALANCE_RUNNING,
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
+
+ /*
+ * The checksumming has an optimized version and is considered fast,
+ * so we don't need to offload checksums to workqueues.
+ */
+ BTRFS_FS_CSUM_IMPL_FAST,
};
struct btrfs_fs_info {
@@ -823,7 +559,7 @@ struct btrfs_fs_info {
struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
- struct btrfs_mapping_tree mapping_tree;
+ struct extent_map_tree mapping_tree;
/*
* block reservation for extent, checksum, root tree and
@@ -988,7 +724,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
- struct btrfs_workqueue *submit_workers;
struct btrfs_workqueue *caching_workers;
struct btrfs_workqueue *readahead_workers;
@@ -1000,21 +735,18 @@ struct btrfs_fs_info {
struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
- /* the extent workers do delayed refs on the extent allocation tree */
- struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
u32 thread_pool_size;
struct kobject *space_info_kobj;
- struct list_head pending_raid_kobjs;
- spinlock_t pending_raid_kobjs_lock; /* uncontended */
u64 total_pinned;
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
+ struct percpu_counter dio_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
@@ -1092,10 +824,7 @@ struct btrfs_fs_info {
/* holds configuration and tracking. Protected by qgroup_lock */
struct rb_root qgroup_tree;
- struct rb_root qgroup_op_tree;
spinlock_t qgroup_lock;
- spinlock_t qgroup_op_lock;
- atomic_t qgroup_op_seq;
/*
* used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1152,12 +881,6 @@ struct btrfs_fs_info {
struct mutex unused_bg_unpin_mutex;
struct mutex delete_unused_bgs_mutex;
- /*
- * Chunks that can't be freed yet (under a trim/discard operation)
- * and will be latter freed. Protected by fs_info->chunk_mutex.
- */
- struct list_head pinned_chunks;
-
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
@@ -1167,6 +890,14 @@ struct btrfs_fs_info {
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
+ struct crypto_shash *csum_shash;
+
+ /*
+ * Number of send operations in progress.
+ * Updated while holding fs_info::balance_mutex.
+ */
+ int send_in_progress;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1348,6 +1079,12 @@ struct btrfs_root {
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
int send_in_progress;
+ /*
+ * Number of currently running deduplication operations that have a
+ * destination inode belonging to this root. Protected by the lock
+ * root_item_lock.
+ */
+ int dedupe_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
atomic_t snapshot_force_cow;
@@ -1368,6 +1105,16 @@ struct btrfs_root {
#endif
};
+struct btrfs_clone_extent_info {
+ u64 disk_offset;
+ u64 disk_len;
+ u64 data_offset;
+ u64 data_len;
+ u64 file_offset;
+ char *extent_buf;
+ u32 item_size;
+};
+
struct btrfs_file_private {
void *filldir_buf;
};
@@ -1466,19 +1213,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
btrfs_clear_opt(fs_info->mount_opt, opt); \
}
-#ifdef CONFIG_BTRFS_DEBUG
-static inline int
-btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
-
- return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_DATA);
-}
-#endif
-
/*
* Requests for changes that need to be done during transaction commit.
*
@@ -1540,6 +1274,21 @@ do { \
#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+#define BTRFS_INODE_FLAG_MASK \
+ (BTRFS_INODE_NODATASUM | \
+ BTRFS_INODE_NODATACOW | \
+ BTRFS_INODE_READONLY | \
+ BTRFS_INODE_NOCOMPRESS | \
+ BTRFS_INODE_PREALLOC | \
+ BTRFS_INODE_SYNC | \
+ BTRFS_INODE_IMMUTABLE | \
+ BTRFS_INODE_APPEND | \
+ BTRFS_INODE_NODUMP | \
+ BTRFS_INODE_NOATIME | \
+ BTRFS_INODE_DIRSYNC | \
+ BTRFS_INODE_COMPRESS | \
+ BTRFS_INODE_ROOT_ITEM_INIT)
+
struct btrfs_map_token {
const struct extent_buffer *eb;
char *kaddr;
@@ -1549,8 +1298,10 @@ struct btrfs_map_token {
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sb->s_blocksize_bits)
-static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+static inline void btrfs_init_map_token(struct btrfs_map_token *token,
+ struct extent_buffer *eb)
{
+ token->eb = eb;
token->kaddr = NULL;
}
@@ -1581,17 +1332,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \
unsigned long off, u##bits val, \
struct btrfs_map_token *token); \
-static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
- const void *ptr, \
- unsigned long off) \
-{ \
- return btrfs_get_token_##bits(eb, ptr, off, NULL); \
-} \
-static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\
- unsigned long off, u##bits val) \
-{ \
- btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
-}
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val);
DECLARE_BTRFS_SETGET_BITS(8)
DECLARE_BTRFS_SETGET_BITS(16)
@@ -1776,18 +1520,18 @@ static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
}
/* struct btrfs_block_group_item */
-BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags,
+BTRFS_SETGET_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
struct btrfs_block_group_item, flags, 64);
/* struct btrfs_free_space_info */
@@ -2133,16 +1877,6 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
btrfs_disk_key_to_cpu(key, &disk_key);
}
-static inline u8 btrfs_key_type(const struct btrfs_key *key)
-{
- return key->type;
-}
-
-static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
-{
- key->type = val;
-}
-
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -2163,18 +1897,16 @@ static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
return (btrfs_header_flags(eb) & flag) == flag;
}
-static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
{
u64 flags = btrfs_header_flags(eb);
btrfs_set_header_flags(eb, flags | flag);
- return (flags & flag) == flag;
}
-static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
{
u64 flags = btrfs_header_flags(eb);
btrfs_set_header_flags(eb, flags & ~flag);
- return (flags & flag) == flag;
}
static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
@@ -2430,14 +2162,10 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
-{
- u16 t = btrfs_super_csum_type(s);
- /*
- * csum type is validated at mount time
- */
- return btrfs_csum_sizes[t];
-}
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __const btrfs_get_num_csums(void);
/*
@@ -2445,13 +2173,12 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
* this returns the address of the start of the last item,
* which is the stop of the leaf data stack
*/
-static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info,
- const struct extent_buffer *leaf)
+static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
u32 nr = btrfs_header_nritems(leaf);
if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(fs_info);
+ return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
return btrfs_item_offset_nr(leaf, nr - 1);
}
@@ -2512,30 +2239,6 @@ static inline u32 btrfs_file_extent_inline_item_len(
return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
-/* btrfs_dev_stats_item */
-static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
- const struct btrfs_dev_stats_item *ptr,
- int index)
-{
- u64 val;
-
- read_extent_buffer(eb, &val,
- offsetof(struct btrfs_dev_stats_item, values) +
- ((unsigned long)ptr) + (index * sizeof(u64)),
- sizeof(val));
- return val;
-}
-
-static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
- struct btrfs_dev_stats_item *ptr,
- int index, u64 val)
-{
- write_extent_buffer(eb, &val,
- offsetof(struct btrfs_dev_stats_item, values) +
- ((unsigned long)ptr) + (index * sizeof(u64)),
- sizeof(val));
-}
-
/* btrfs_qgroup_status_item */
BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
generation, 64);
@@ -2631,6 +2334,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
+static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
+{
+ return crc32c(crc, address, length);
+}
+
+static inline void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
static inline u64 btrfs_name_hash(const char *name, int len)
{
return crc32c((u32)~1, name, len);
@@ -2645,12 +2358,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
return (u64) crc32c(parent_objectid, name, len);
}
-static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
-{
- return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
- (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
-}
-
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -2668,38 +2375,35 @@ enum btrfs_inline_ref_type {
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
-static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/*
- * Doing a truncate won't result in new nodes or leaves, just what we need for
- * COW.
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
*/
-static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
-bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
- const u64 start);
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
- unsigned long count, u64 transid, int wait);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
@@ -2711,15 +2415,9 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num, int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes);
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb);
+int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
- struct btrfs_fs_info *info,
- u64 bytenr);
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2745,13 +2443,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 flags,
int level, int is_data);
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
@@ -2760,35 +2454,11 @@ int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
-
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+ struct btrfs_ref *generic_ref);
+
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_free_block_groups(struct btrfs_fs_info *info);
-int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 bytes_used, u64 type, u64 chunk_offset,
- u64 size);
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info);
-struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
- struct btrfs_fs_info *fs_info,
- const u64 chunk_offset);
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group *cache);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
enum btrfs_reserve_flush_enum {
@@ -2799,6 +2469,7 @@ enum btrfs_reserve_flush_enum {
* case, use FLUSH LIMIT
*/
BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_EVICT,
BTRFS_RESERVE_FLUSH_ALL,
};
@@ -2811,73 +2482,23 @@ enum btrfs_flush_state {
FLUSH_DELALLOC_WAIT = 6,
ALLOC_CHUNK = 7,
ALLOC_CHUNK_FORCE = 8,
- COMMIT_TRANS = 9,
+ RUN_DELAYED_IPUTS = 9,
+ COMMIT_TRANS = 10,
};
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
- u64 len);
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv,
- unsigned short type);
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, u64 num_bytes,
- enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
- enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
- bool update_size);
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor);
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
-void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
-int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
- enum btrfs_reserve_flush_enum flush);
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
- u64 num_bytes);
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
@@ -2886,15 +2507,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
-void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- u64 start, u64 end);
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int level, int *slot);
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
@@ -2912,20 +2529,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path,
u64 min_trans);
-enum btrfs_compare_tree_result {
- BTRFS_COMPARE_TREE_NEW,
- BTRFS_COMPARE_TREE_DELETED,
- BTRFS_COMPARE_TREE_CHANGED,
- BTRFS_COMPARE_TREE_SAME,
-};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
-int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t cb, void *ctx);
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+ int slot);
+
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
@@ -2936,10 +2542,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- u32 data_size);
-void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2967,8 +2571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
-void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
@@ -3015,8 +2617,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
}
-int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf);
+int btrfs_leaf_free_space(struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
int update_ref, int for_reloc);
@@ -3177,19 +2778,18 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
-int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
- const char *name,
- int name_len, struct btrfs_inode_ref **ref_ret);
-int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
- u64 ref_objectid, const char *name,
- int name_len,
- struct btrfs_inode_extref **extref_ret);
-
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot, const char *name,
+ int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const char *name, int name_len);
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+ struct btrfs_root *root, u64 bytenr, u64 len);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+ u8 *dst);
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@@ -3245,7 +2845,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
- struct extent_state **cached_state, int dedupe);
+ struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
@@ -3267,14 +2867,14 @@ void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
+void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path);
+ struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *was_new);
+ struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end, int create);
@@ -3310,7 +2910,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
@@ -3340,6 +2940,10 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
+int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+ const u64 start, const u64 end,
+ struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3355,12 +2959,6 @@ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-/* sysfs.c */
-int __init btrfs_init_sysfs(void);
-void __cold btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-
/* super.c */
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
@@ -3516,8 +3114,7 @@ __cold
static inline void assfail(const char *expr, const char *file, int line)
{
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
- pr_err("assertion failed: %s, file: %s, line: %d\n",
- expr, file, line);
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
}
}
@@ -3547,7 +3144,7 @@ __cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
-const char *btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@ -3601,10 +3198,11 @@ do { \
/* compatibility and incompatibility defines */
#define btrfs_set_fs_incompat(__fs_info, opt) \
- __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+ #opt)
static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag)
+ u64 flag, const char* name)
{
struct btrfs_super_block *disk_super;
u64 features;
@@ -3617,18 +3215,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
if (!(features & flag)) {
features |= flag;
btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info, "setting %llu feature flag",
- flag);
+ btrfs_info(fs_info,
+ "setting incompat feature flag for %s (0x%llx)",
+ name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
#define btrfs_clear_fs_incompat(__fs_info, opt) \
- __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+ #opt)
static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag)
+ u64 flag, const char* name)
{
struct btrfs_super_block *disk_super;
u64 features;
@@ -3641,8 +3241,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
if (features & flag) {
features &= ~flag;
btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info, "clearing %llu feature flag",
- flag);
+ btrfs_info(fs_info,
+ "clearing incompat feature flag for %s (0x%llx)",
+ name, flag);
}
spin_unlock(&fs_info->super_lock);
}
@@ -3659,10 +3260,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
}
#define btrfs_set_fs_compat_ro(__fs_info, opt) \
- __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+ #opt)
static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag)
+ u64 flag, const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
@@ -3675,18 +3277,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
if (!(features & flag)) {
features |= flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info, "setting %llu ro feature flag",
- flag);
+ btrfs_info(fs_info,
+ "setting compat-ro feature flag for %s (0x%llx)",
+ name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
- __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+ #opt)
static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag)
+ u64 flag, const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
@@ -3699,8 +3303,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
if (features & flag) {
features &= ~flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info, "clearing %llu ro feature flag",
- flag);
+ btrfs_info(fs_info,
+ "clearing compat-ro feature flag for %s (0x%llx)",
+ name, flag);
}
spin_unlock(&fs_info->super_lock);
}
@@ -3755,8 +3360,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
- struct btrfs_device *dev);
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
static inline void btrfs_init_full_stripe_locks_tree(
@@ -3805,6 +3409,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode);
@@ -3821,26 +3427,4 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
-static inline void cond_wake_up(struct wait_queue_head *wq)
-{
- /*
- * This implies a full smp_mb barrier, see comments for
- * waitqueue_active why.
- */
- if (wq_has_sleeper(wq))
- wake_up(wq);
-}
-
-static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
-{
- /*
- * Special case for conditional wakeup where the barrier required for
- * waitqueue_active is implied by some of the preceding code. Eg. one
- * of such atomic operations (atomic_dec_and_return, ...), or a
- * unlock/lock sequence, etc.
- */
- if (waitqueue_active(wq))
- wake_up(wq);
-}
-
#endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
deleted file mode 100644
index 90281a7a35a8..000000000000
--- a/fs/btrfs/dedupe.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2016 Fujitsu. All rights reserved.
- */
-
-#ifndef BTRFS_DEDUPE_H
-#define BTRFS_DEDUPE_H
-
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
-
-#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
new file mode 100644
index 000000000000..4cdac4d834f5
--- /dev/null
+++ b/fs/btrfs/delalloc-space.c
@@ -0,0 +1,490 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "block-rsv.h"
+#include "btrfs_inode.h"
+#include "space-info.h"
+#include "transaction.h"
+#include "qgroup.h"
+#include "block-group.h"
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ u64 used;
+ int ret = 0;
+ int need_commit = 2;
+ int have_pinned_space;
+
+ /* Make sure bytes are sectorsize aligned */
+ bytes = ALIGN(bytes, fs_info->sectorsize);
+
+ if (btrfs_is_free_space_inode(inode)) {
+ need_commit = 0;
+ ASSERT(current->journal_info);
+ }
+
+again:
+ /* Make sure we have enough space to handle the data first */
+ spin_lock(&data_sinfo->lock);
+ used = btrfs_space_info_used(data_sinfo, true);
+
+ if (used + bytes > data_sinfo->total_bytes) {
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * If we don't have enough free bytes in this space then we need
+ * to alloc a new chunk.
+ */
+ if (!data_sinfo->full) {
+ u64 alloc_target;
+
+ data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
+ spin_unlock(&data_sinfo->lock);
+
+ alloc_target = btrfs_data_alloc_profile(fs_info);
+ /*
+ * It is ugly that we don't call nolock join
+ * transaction for the free space inode case here.
+ * But it is safe because we only do the data space
+ * reservation for the free space cache in the
+ * transaction context, the common join transaction
+ * just increase the counter of the current transaction
+ * handler, doesn't try to acquire the trans_lock of
+ * the fs.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_chunk_alloc(trans, alloc_target,
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ return ret;
+ else {
+ have_pinned_space = 1;
+ goto commit_trans;
+ }
+ }
+
+ goto again;
+ }
+
+ /*
+ * If we don't have enough pinned space to deal with this
+ * allocation, and no removed chunk in current transaction,
+ * don't bother committing the transaction.
+ */
+ have_pinned_space = __percpu_counter_compare(
+ &data_sinfo->total_bytes_pinned,
+ used + bytes - data_sinfo->total_bytes,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ spin_unlock(&data_sinfo->lock);
+
+ /* Commit the current transaction and try again */
+commit_trans:
+ if (need_commit) {
+ need_commit--;
+
+ if (need_commit > 0) {
+ btrfs_start_delalloc_roots(fs_info, -1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
+ (u64)-1);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ if (have_pinned_space >= 0 ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
+ need_commit > 0) {
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+ /*
+ * The cleaner kthread might still be doing iput
+ * operations. Wait for it to finish so that
+ * more space is released. We don't need to
+ * explicitly run the delayed iputs here because
+ * the commit_transaction would have woken up
+ * the cleaner.
+ */
+ ret = btrfs_wait_on_delayed_iputs(fs_info);
+ if (ret)
+ return ret;
+ goto again;
+ } else {
+ btrfs_end_transaction(trans);
+ }
+ }
+
+ trace_btrfs_space_reservation(fs_info,
+ "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ return -ENOSPC;
+ }
+ btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
+ spin_unlock(&data_sinfo->lock);
+
+ return 0;
+}
+
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+ if (ret < 0)
+ return ret;
+
+ /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
+ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ else
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
+ */
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_space_info *data_sinfo;
+
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
+
+ data_sinfo = fs_info->data_sinfo;
+ spin_lock(&data_sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
+ spin_unlock(&data_sinfo->lock);
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-inode data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->fs_info->sectorsize) -
+ round_down(start, root->fs_info->sectorsize);
+ start = round_down(start, root->fs_info->sectorsize);
+
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len);
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ * Unlike normal operation, qgroup meta reservation needs to know if we are
+ * freeing qgroup reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal release.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 released = 0;
+ u64 qgroup_to_release = 0;
+
+ /*
+ * Since we statically set the block_rsv->size we just want to say we
+ * are releasing 0 bytes, and then we'll just get the reservation over
+ * the size free'd.
+ */
+ released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delalloc",
+ btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root,
+ qgroup_to_release);
+}
+
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 reserve_size = 0;
+ u64 qgroup_rsv_size = 0;
+ u64 csum_leaves;
+ unsigned outstanding_extents;
+
+ lockdep_assert_held(&inode->lock);
+ outstanding_extents = inode->outstanding_extents;
+
+ /*
+ * Insert size for the number of outstanding extents, 1 normal size for
+ * updating the inode.
+ */
+ if (outstanding_extents) {
+ reserve_size = btrfs_calc_insert_metadata_size(fs_info,
+ outstanding_extents);
+ reserve_size += btrfs_calc_metadata_size(fs_info, 1);
+ }
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+ inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info,
+ csum_leaves);
+ /*
+ * For qgroup rsv, the calculation is very simple:
+ * account one nodesize for each outstanding extent
+ *
+ * This is overestimating in most cases.
+ */
+ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
+
+ spin_lock(&block_rsv->lock);
+ block_rsv->size = reserve_size;
+ block_rsv->qgroup_rsv_size = qgroup_rsv_size;
+ spin_unlock(&block_rsv->lock);
+}
+
+static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+ u64 num_bytes, u64 *meta_reserve,
+ u64 *qgroup_reserve)
+{
+ u64 nr_extents = count_max_extents(num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+
+ *meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
+ nr_extents + csum_leaves);
+
+ /*
+ * finish_ordered_io has to update the inode, so add the space required
+ * for an inode update.
+ */
+ *meta_reserve += inode_update;
+ *qgroup_reserve = nr_extents * fs_info->nodesize;
+}
+
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 meta_reserve, qgroup_reserve;
+ unsigned nr_extents;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+
+ /*
+ * If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ *
+ * If we have a transaction open (can happen if we call truncate_block
+ * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
+
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ }
+
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+
+ /*
+ * We always want to do it this way, every other way is wrong and ends
+ * in tears. Pre-reserving the amount we are going to add will always
+ * be the right way, because otherwise if we have enough parallelism we
+ * could end up with thousands of inodes all holding little bits of
+ * reservations they were able to make previously and the only way to
+ * reclaim that space is to ENOSPC out the operations and clear
+ * everything out and try again, which is bad. This way we just
+ * over-reserve slightly, and clean up the mess when we are done.
+ */
+ calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
+ &qgroup_reserve);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ if (ret)
+ return ret;
+ ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+ return ret;
+ }
+
+ /*
+ * Now we need to update our outstanding extents and csum bytes _first_
+ * and then add the reservation to the block_rsv. This keeps us from
+ * racing with an ordered completion or some such that would think it
+ * needs to free the reservation we just made.
+ */
+ spin_lock(&inode->lock);
+ nr_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, nr_extents);
+ inode->csum_bytes += num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ /* Now we can safely add our space to our block rsv */
+ btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), meta_reserve, 1);
+
+ spin_lock(&block_rsv->lock);
+ block_rsv->qgroup_rsv_reserved += qgroup_reserve;
+ spin_unlock(&block_rsv->lock);
+
+ return 0;
+}
+
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations, or on error for the same reason.
+ */
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ spin_lock(&inode->lock);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, qgroup_free);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add. Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents. This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned num_extents;
+
+ spin_lock(&inode->lock);
+ num_extents = count_max_extents(num_bytes);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, true);
+}
+
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ * @reserved: mandatory parameter, record actually reserved qgroup ranges of
+ * current reservation.
+ *
+ * This will do the following things
+ *
+ * - reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * - reserve space for metadata space, based on the number of outstanding
+ * extents and how much csums will be needed
+ * also reserve metadata space in a per root over-reserve method.
+ * - add to the inodes->delalloc_bytes
+ * - add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
+ */
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, *reserved, start, len);
+ return ret;
+}
+
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
+ * @release_bytes: the len of the space we consumed or didn't use
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
+ */
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free)
+{
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+ btrfs_free_reserved_data_space(inode, reserved, start, len);
+}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
new file mode 100644
index 000000000000..54466fbd7075
--- /dev/null
+++ b/fs/btrfs/delalloc-space.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DELALLOC_SPACE_H
+#define BTRFS_DELALLOC_SPACE_H
+
+struct extent_changeset;
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+
+#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c669f250d4a0..d3e15e1d4a91 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,11 +6,13 @@
#include <linux/slab.h>
#include <linux/iversion.h>
+#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "ctree.h"
#include "qgroup.h"
+#include "locking.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -474,6 +476,9 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
+ /* Not associated with any delayed_node */
+ if (!delayed_item->delayed_node)
+ return;
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
@@ -555,7 +560,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
src_rsv = trans->block_rsv;
dst_rsv = &fs_info->delayed_block_rsv;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
/*
* Here we migrate space rsv from transaction rsv, since have already
@@ -609,7 +614,7 @@ static int btrfs_delayed_inode_reserve_metadata(
src_rsv = trans->block_rsv;
dst_rsv = &fs_info->delayed_block_rsv;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+ num_bytes = btrfs_calc_metadata_size(fs_info, 1);
/*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
@@ -691,7 +696,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
int free_space;
int total_data_size = 0, total_size = 0;
@@ -708,7 +712,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
BUG_ON(!path->nodes[0]);
leaf = path->nodes[0];
- free_space = btrfs_leaf_free_space(fs_info, leaf);
+ free_space = btrfs_leaf_free_space(leaf);
INIT_LIST_HEAD(&head);
next = item;
@@ -1364,8 +1368,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
- btrfs_async_run_delayed_root, NULL, NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
+ NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1526,7 +1530,12 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
*/
- BUG_ON(ret);
+ if (ret < 0) {
+ btrfs_err(trans->fs_info,
+"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+ btrfs_release_delayed_item(item);
+ goto end;
+ }
mutex_lock(&node->mutex);
ret = __btrfs_add_delayed_deletion_item(node, item);
@@ -1535,7 +1544,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
index, node->root->root_key.objectid,
node->inode_id, ret);
- BUG();
+ btrfs_delayed_item_release_metadata(dir->root, item);
+ btrfs_release_delayed_item(item);
}
mutex_unlock(&node->mutex);
end:
@@ -1692,7 +1702,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
name = (char *)(di + 1);
name_len = btrfs_stack_dir_name_len(di);
- d_type = btrfs_filetype_table[di->type];
+ d_type = fs_ftype_to_dtype(di->type);
btrfs_disk_key_to_cpu(&location, &di->location);
over = !dir_emit(ctx, name, name_len,
@@ -1940,12 +1950,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
-
- for (i = 0; i < n; i++)
- refcount_inc(&delayed_nodes[i]->refs);
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7d2a413df90d..df3bd880061d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -10,6 +10,7 @@
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
+#include "space-info.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
* of hammering updates on the extent allocation tree.
*/
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ bool ret = false;
+ u64 reserved;
+
+ spin_lock(&global_rsv->lock);
+ reserved = global_rsv->reserved;
+ spin_unlock(&global_rsv->lock);
+
+ /*
+ * Since the global reserve is just kind of magic we don't really want
+ * to rely on it to save our bacon, so if our size is more than the
+ * delayed_refs_rsv and the global rsv then it's time to think about
+ * bailing.
+ */
+ spin_lock(&delayed_refs_rsv->lock);
+ reserved += delayed_refs_rsv->reserved;
+ if (delayed_refs_rsv->size >= reserved)
+ ret = true;
+ spin_unlock(&delayed_refs_rsv->lock);
+ return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
+{
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+ u64 val;
+
+ smp_mb();
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
+ if (val >= NSEC_PER_SEC)
+ return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
+
+ return btrfs_check_space_for_delayed_refs(trans->fs_info);
+}
+
+/**
+ * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
+ * @fs_info - the fs_info for our fs.
+ * @nr - the number of items to drop.
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+ u64 released = 0;
+
+ released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
+ NULL);
+ if (released)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
+ * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes;
+
+ if (!trans->delayed_ref_updates)
+ return;
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info,
+ trans->delayed_ref_updates);
+ spin_lock(&delayed_rsv->lock);
+ delayed_rsv->size += num_bytes;
+ delayed_rsv->full = 0;
+ spin_unlock(&delayed_rsv->lock);
+ trans->delayed_ref_updates = 0;
+}
+
+/**
+ * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
+ * @fs_info - the fs info for our fs.
+ * @src - the source block rsv to transfer from.
+ * @num_bytes - the number of bytes to transfer.
+ *
+ * This transfers up to the num_bytes amount from the src rsv to the
+ * delayed_refs_rsv. Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ u64 to_free = 0;
+
+ spin_lock(&src->lock);
+ src->reserved -= num_bytes;
+ src->size -= num_bytes;
+ spin_unlock(&src->lock);
+
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+ u64 delta = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ if (num_bytes > delta) {
+ to_free = num_bytes - delta;
+ num_bytes = delta;
+ }
+ } else {
+ to_free = num_bytes;
+ num_bytes = 0;
+ }
+
+ if (num_bytes)
+ delayed_refs_rsv->reserved += num_bytes;
+ if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+ delayed_refs_rsv->full = 1;
+ spin_unlock(&delayed_refs_rsv->lock);
+
+ if (num_bytes)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ if (to_free)
+ btrfs_space_info_free_bytes_may_use(fs_info,
+ delayed_refs_rsv->space_info, to_free);
+}
+
+/**
+ * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
+ * @fs_info - the fs_info for our fs.
+ * @flush - control how we can flush for this reservation.
+ *
+ * This will refill the delayed block_rsv up to 1 items size worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ num_bytes = min(num_bytes, limit);
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (!num_bytes)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
+ num_bytes, flush);
+ if (ret)
+ return ret;
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ return 0;
+}
+
/*
* compare two delayed tree backrefs with same bytenr and type
*/
@@ -735,8 +909,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
* transaction commits.
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action,
+ struct btrfs_ref *generic_ref,
struct btrfs_delayed_extent_op *extent_op,
int *old_ref_mod, int *new_ref_mod)
{
@@ -746,10 +919,18 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
- bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ bool is_system;
+ int action = generic_ref->action;
+ int level = generic_ref->tree_ref.level;
int ret;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
u8 ref_type;
+ is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+
+ ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -762,7 +943,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root)) {
+ is_fstree(generic_ref->real_root) &&
+ is_fstree(generic_ref->tree_ref.root) &&
+ !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -777,13 +960,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- ref_root, action, ref_type);
- ref->root = ref_root;
+ generic_ref->tree_ref.root, action, ref_type);
+ ref->root = generic_ref->tree_ref.root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- ref_root, 0, action, false, is_system);
+ generic_ref->tree_ref.root, 0, action, false,
+ is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -822,10 +1006,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
- u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action,
- int *old_ref_mod, int *new_ref_mod)
+ struct btrfs_ref *generic_ref,
+ u64 reserved, int *old_ref_mod,
+ int *new_ref_mod)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
@@ -833,9 +1016,17 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
+ int action = generic_ref->action;
int ret;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
+ u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 owner = generic_ref->data_ref.ino;
+ u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
+ ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -859,7 +1050,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root)) {
+ is_fstree(ref_root) &&
+ is_fstree(generic_ref->real_root) &&
+ !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -905,8 +1098,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
@@ -939,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
}
/*
- * this does a simple search for the head node for a given extent.
- * It must be called with the delayed ref spinlock held, and it returns
- * the head node if any where found, or NULL if not.
+ * This does a simple search for the head node for a given extent. Returns the
+ * head node if found, or NULL if not.
*/
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
+ lockdep_assert_held(&delayed_refs->lock);
+
return find_ref_head(delayed_refs, bytenr, false);
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 70606da440aa..1c977e6d45dc 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -176,6 +176,83 @@ struct btrfs_delayed_ref_root {
u64 qgroup_to_skip;
};
+enum btrfs_ref_type {
+ BTRFS_REF_NOT_SET,
+ BTRFS_REF_DATA,
+ BTRFS_REF_METADATA,
+ BTRFS_REF_LAST,
+};
+
+struct btrfs_data_ref {
+ /* For EXTENT_DATA_REF */
+
+ /* Root which refers to this data extent */
+ u64 ref_root;
+
+ /* Inode which refers to this data extent */
+ u64 ino;
+
+ /*
+ * file_offset - extent_offset
+ *
+ * file_offset is the key.offset of the EXTENT_DATA key.
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+ */
+ u64 offset;
+};
+
+struct btrfs_tree_ref {
+ /*
+ * Level of this tree block
+ *
+ * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+ */
+ int level;
+
+ /*
+ * Root which refers to this tree block.
+ *
+ * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
+ */
+ u64 root;
+
+ /* For non-skinny metadata, no special member needed */
+};
+
+struct btrfs_ref {
+ enum btrfs_ref_type type;
+ int action;
+
+ /*
+ * Whether this extent should go through qgroup record.
+ *
+ * Normally false, but for certain cases like delayed subtree scan,
+ * setting this flag can hugely reduce qgroup overhead.
+ */
+ bool skip_qgroup;
+
+ /*
+ * Optional. For which root is this modification.
+ * Mostly used for qgroup optimization.
+ *
+ * When unset, data/tree ref init code will populate it.
+ * In certain cases, we're modifying reference for a different root.
+ * E.g. COW fs tree blocks for balance.
+ * In that case, tree_ref::root will be fs tree, but we're doing this
+ * for reloc tree, then we should set @real_root to reloc tree.
+ */
+ u64 real_root;
+ u64 bytenr;
+ u64 len;
+
+ /* Bytenr of the parent tree block */
+ u64 parent;
+ union {
+ struct btrfs_data_ref data_ref;
+ struct btrfs_tree_ref tree_ref;
+ };
+};
+
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
@@ -184,6 +261,38 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
void __cold btrfs_delayed_ref_exit(void);
+static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
+ int action, u64 bytenr, u64 len, u64 parent)
+{
+ generic_ref->action = action;
+ generic_ref->bytenr = bytenr;
+ generic_ref->len = len;
+ generic_ref->parent = parent;
+}
+
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
+ int level, u64 root)
+{
+ /* If @real_root not set, use @root as fallback */
+ if (!generic_ref->real_root)
+ generic_ref->real_root = root;
+ generic_ref->tree_ref.level = level;
+ generic_ref->tree_ref.root = root;
+ generic_ref->type = BTRFS_REF_METADATA;
+}
+
+static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
+ u64 ref_root, u64 ino, u64 offset)
+{
+ /* If @real_root not set, use @root as fallback */
+ if (!generic_ref->real_root)
+ generic_ref->real_root = ref_root;
+ generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->data_ref.ino = ino;
+ generic_ref->data_ref.offset = offset;
+ generic_ref->type = BTRFS_REF_DATA;
+}
+
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
{
@@ -224,17 +333,14 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea
}
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action,
+ struct btrfs_ref *generic_ref,
struct btrfs_delayed_extent_op *extent_op,
int *old_ref_mod, int *new_ref_mod);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
- u64 parent, u64 ref_root,
- u64 owner, u64 offset, u64 reserved, int action,
- int *old_ref_mod, int *new_ref_mod);
-int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ u64 reserved, int *old_ref_mod,
+ int *new_ref_mod);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -258,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *src,
+ u64 num_bytes);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+
/*
* helper functions to cast a node into its container
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ee193c5222b2..ba4d8f375b3c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
+#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
@@ -56,7 +57,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
no_valid_dev_replace_entry_found:
ret = 0;
dev_replace->replace_state =
- BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
dev_replace->time_started = 0;
@@ -201,7 +202,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return PTR_ERR(bdev);
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
@@ -237,7 +238,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
rcu_assign_pointer(device->name, name);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
@@ -256,6 +256,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
@@ -273,9 +275,9 @@ error:
* called from commit_transaction. Writes changed device replace state to
* disk.
*/
-int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
@@ -399,7 +401,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- bool need_unlock;
src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name);
@@ -413,11 +414,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return -ETXTBSY;
}
- ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
- src_device, &tgt_device);
- if (ret)
- return ret;
-
/*
* Here we commit the transaction to make sure commit_total_bytes
* of all the devices are updated.
@@ -431,7 +427,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
- need_unlock = true;
+ ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
+ src_device, &tgt_device);
+ if (ret)
+ return ret;
+
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -442,11 +442,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ up_write(&dev_replace->rwsem);
goto leave;
}
dev_replace->cont_reading_from_srcdev_mode = read_src;
- WARN_ON(!src_device);
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
@@ -471,7 +471,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -479,16 +478,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* force writing the updated state information to disk */
- trans = btrfs_start_transaction(root, 0);
+ /* Commit dev_replace state and reserve 1 item for it. */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- need_unlock = true;
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
+ up_write(&dev_replace->rwsem);
goto leave;
}
@@ -501,17 +500,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
- if (ret == -EINPROGRESS) {
+ if (ret == -EINPROGRESS)
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- } else if (ret != -ECANCELED) {
- WARN_ON(ret);
- }
return ret;
leave:
- if (need_unlock)
- up_write(&dev_replace->rwsem);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -603,17 +597,33 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+ * chunks shouldn't be allocated on the device.
+ */
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* Prevent write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /* Prevent new chunks being allocated on the source device */
+ mutex_lock(&fs_info->chunk_mutex);
+
+ if (!list_empty(&src_device->post_commit_list)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -662,8 +672,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
- ASSERT(list_empty(&src_device->resized_list));
- tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
btrfs_assign_next_active_device(src_device, tgt_device);
@@ -696,7 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
- btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_free_srcdev(src_device);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -713,7 +721,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
u64 start = 0;
@@ -975,7 +983,7 @@ static int btrfs_dev_replace_kthread(void *data)
return 0;
}
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
return 0;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 4aa40bacc6cc..60b70dacc299 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -9,8 +9,7 @@
struct btrfs_ioctl_dev_replace_args;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
-int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
@@ -18,6 +17,6 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 8de74d835dba..863367c2c620 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -36,7 +36,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(fs_info, path, data_size);
+ btrfs_extend_item(path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -429,8 +429,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(root->fs_info, path,
- item_len - sub_item_len, 1);
+ btrfs_truncate_item(path, item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6fe9197f6ee4..e0edfdc9c82b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -19,6 +19,7 @@
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -39,10 +40,7 @@
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
-
-#ifdef CONFIG_X86
-#include <asm/cpufeature.h>
-#endif
+#include "block-group.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -207,7 +205,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
@@ -215,7 +212,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
goto out;
}
@@ -230,7 +226,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
em->block_start = 0;
- em->bdev = fs_info->fs_devices->latest_bdev;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
@@ -249,26 +244,15 @@ out:
return em;
}
-u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
-{
- return crc32c(seed, data, len);
-}
-
-void btrfs_csum_final(u32 crc, u8 *result)
-{
- put_unaligned_le32(~crc, result);
-}
-
/*
- * compute the csum for a btree block, and either verify it or write it
- * into the csum field of the block.
+ * Compute the csum of a btree block and store the result to provided buffer.
+ *
+ * Returns error if the extent buffer cannot be mapped.
*/
-static int csum_tree_block(struct btrfs_fs_info *fs_info,
- struct extent_buffer *buf,
- int verify)
+static int csum_tree_block(struct extent_buffer *buf, u8 *result)
{
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- char result[BTRFS_CSUM_SIZE];
+ struct btrfs_fs_info *fs_info = buf->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
@@ -276,9 +260,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
unsigned long map_start;
unsigned long map_len;
int err;
- u32 crc = ~(u32)0;
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
len = buf->len - offset;
+
while (len > 0) {
/*
* Note: we don't need to check for the err == 1 case here, as
@@ -288,34 +275,16 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
*/
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
- if (err)
+ if (WARN_ON(err))
return err;
cur_len = min(len, map_len - (offset - map_start));
- crc = btrfs_csum_data(kaddr + offset - map_start,
- crc, cur_len);
+ crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
len -= cur_len;
offset += cur_len;
}
memset(result, 0, BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, result);
-
- if (verify) {
- if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
- memcpy(&found, result, csum_size);
-
- read_extent_buffer(buf, &val, 0, csum_size);
- btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %X found %X level %d",
- fs_info->sb->s_id, buf->start,
- val, found, btrfs_header_level(buf));
- return -EUCLEAN;
- }
- } else {
- write_extent_buffer(buf, result, 0, csum_size);
- }
+ crypto_shash_final(shash, result);
return 0;
}
@@ -376,6 +345,19 @@ out:
return ret;
}
+static bool btrfs_supported_super_csum(u16 csum_type)
+{
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ case BTRFS_CSUM_TYPE_XXHASH:
+ case BTRFS_CSUM_TYPE_SHA256:
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
@@ -385,51 +367,42 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
{
struct btrfs_super_block *disk_sb =
(struct btrfs_super_block *)raw_disk_sb;
- u16 csum_type = btrfs_super_csum_type(disk_sb);
- int ret = 0;
-
- if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
- u32 crc = ~(u32)0;
- char result[sizeof(crc)];
+ char result[BTRFS_CSUM_SIZE];
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- /*
- * The super_block structure does not span the whole
- * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
- * is filled with zeros and is included in the checksum.
- */
- crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
- crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, result);
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
- if (memcmp(raw_disk_sb, result, sizeof(result)))
- ret = 1;
- }
+ /*
+ * The super_block structure does not span the whole
+ * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
+ * filled with zeros and is included in the checksum.
+ */
+ crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+ crypto_shash_final(shash, result);
- if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
- btrfs_err(fs_info, "unsupported checksum algorithm %u",
- csum_type);
- ret = 1;
- }
+ if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+ return 1;
- return ret;
+ return 0;
}
-static int verify_level_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level;
struct btrfs_key found_key;
int ret;
found_level = btrfs_header_level(eb);
if (found_level != level) {
-#ifdef CONFIG_BTRFS_DEBUG
- WARN_ON(1);
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, level, found_level);
-#endif
return -EIO;
}
@@ -444,15 +417,25 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
*/
if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
return 0;
+
+ /* We have @first_key, so this @eb must have at least one item */
+ if (btrfs_header_nritems(eb) == 0) {
+ btrfs_err(fs_info,
+ "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return -EUCLEAN;
+ }
+
if (found_level)
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
ret = btrfs_comp_cpu_keys(first_key, &found_key);
-#ifdef CONFIG_BTRFS_DEBUG
if (ret) {
- WARN_ON(1);
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, parent_transid, first_key->objectid,
@@ -460,7 +443,6 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
found_key.objectid, found_key.type,
found_key.offset);
}
-#endif
return ret;
}
@@ -472,11 +454,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
+static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
u64 parent_transid, int level,
struct btrfs_key *first_key)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
int failed = 0;
int ret;
@@ -487,14 +469,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
- mirror_num);
+ ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
if (!ret) {
if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
ret = -EIO;
- else if (verify_level_key(fs_info, eb, level,
- first_key, parent_transid))
+ else if (btrfs_verify_level_key(eb, level,
+ first_key, parent_transid))
ret = -EUCLEAN;
else
break;
@@ -519,7 +500,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
}
if (failed && !ret && failed_mirror)
- repair_eb_io_failure(fs_info, eb, failed_mirror);
+ btrfs_repair_eb_io_failure(eb, failed_mirror);
return ret;
}
@@ -533,7 +514,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
u64 start = page_offset(page);
u64 found_start;
+ u8 result[BTRFS_CSUM_SIZE];
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
struct extent_buffer *eb;
+ int ret;
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
@@ -552,12 +536,30 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
- return csum_tree_block(fs_info, eb, 0);
+ if (csum_tree_block(eb, result))
+ return -EINVAL;
+
+ if (btrfs_header_level(eb))
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+
+ if (ret < 0) {
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info,
+ "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
+ }
+ write_extent_buffer(eb, result, 0, csum_size);
+
+ return 0;
}
-static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static int check_tree_block_fsid(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u8 fsid[BTRFS_FSID_SIZE];
int ret = 1;
@@ -595,7 +597,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int ret = 0;
+ u8 result[BTRFS_CSUM_SIZE];
int reads_done;
if (!page->private)
@@ -606,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
/* the pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
*/
- extent_buffer_get(eb);
+ atomic_inc(&eb->refs);
reads_done = atomic_dec_and_test(&eb->io_pages);
if (!reads_done)
@@ -625,7 +629,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
ret = -EIO;
goto err;
}
- if (check_tree_block_fsid(fs_info, eb)) {
+ if (check_tree_block_fsid(eb)) {
btrfs_err_rl(fs_info, "bad fsid on block %llu",
eb->start);
ret = -EIO;
@@ -642,25 +646,44 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(fs_info, eb, 1);
+ ret = csum_tree_block(eb, result);
if (ret)
goto err;
+ if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
+ u32 val;
+ u32 found = 0;
+
+ memcpy(&found, result, csum_size);
+
+ read_extent_buffer(eb, &val, 0, csum_size);
+ btrfs_warn_rl(fs_info,
+ "%s checksum verify failed on %llu wanted %x found %x level %d",
+ fs_info->sb->s_id, eb->start,
+ val, found, btrfs_header_level(eb));
+ ret = -EUCLEAN;
+ goto err;
+ }
+
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && btrfs_check_node(fs_info, eb))
+ if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
if (!ret)
set_extent_buffer_uptodate(eb);
+ else
+ btrfs_err(fs_info,
+ "block=%llu read time tree block corruption detected",
+ eb->start);
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -685,43 +708,31 @@ static void end_workqueue_bio(struct bio *bio)
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
if (bio_op(bio) == REQ_OP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
- func = btrfs_endio_meta_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
} else {
- if (unlikely(end_io_wq->metadata ==
- BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
wq = fs_info->endio_repair_workers;
- func = btrfs_endio_repair_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else if (end_io_wq->metadata) {
+ else if (end_io_wq->metadata)
wq = fs_info->endio_meta_workers;
- func = btrfs_endio_meta_helper;
- } else {
+ else
wq = fs_info->endio_workers;
- func = btrfs_endio_helper;
- }
}
- btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
btrfs_queue_work(wq, &end_io_wq->work);
}
@@ -782,8 +793,13 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
- async->mirror_num, 1);
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ async->bio->bi_opf |= REQ_CGROUP_PUNT;
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
if (ret) {
async->bio->bi_status = ret;
bio_endio(async->bio);
@@ -814,8 +830,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
- btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
- run_one_async_done, run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+ run_one_async_free);
async->bio_offset = bio_offset;
@@ -832,11 +848,11 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
{
struct bio_vec *bvec;
struct btrfs_root *root;
- int i, ret = 0;
+ int ret = 0;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
@@ -856,24 +872,22 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
return btree_csum_one_bio(bio);
}
-static int check_async_write(struct btrfs_inode *bi)
+static int check_async_write(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *bi)
{
if (atomic_read(&bi->sync_writers))
return 0;
-#ifdef CONFIG_X86
- if (static_cpu_has(X86_FEATURE_XMM4_2))
+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
return 0;
-#endif
return 1;
}
-static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ unsigned long bio_flags)
{
- struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int async = check_async_write(BTRFS_I(inode));
+ int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
if (bio_op(bio) != REQ_OP_WRITE) {
@@ -885,20 +899,19 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
- bio_offset, private_data,
- btree_submit_bio_start);
+ 0, inode, btree_submit_bio_start);
}
if (ret)
@@ -1017,46 +1030,17 @@ static const struct address_space_operations btree_aops = {
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct extent_buffer *buf = NULL;
- struct inode *btree_inode = fs_info->btree_inode;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return;
- read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, WAIT_NONE, 0);
- free_extent_buffer(buf);
-}
-
-int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- int mirror_num, struct extent_buffer **eb)
-{
- struct extent_buffer *buf = NULL;
- struct inode *btree_inode = fs_info->btree_inode;
- struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
buf = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(buf))
- return 0;
-
- set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
- ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
- mirror_num);
- if (ret) {
- free_extent_buffer(buf);
- return ret;
- }
+ return;
- if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
- free_extent_buffer(buf);
- return -EIO;
- } else if (extent_buffer_uptodate(buf)) {
- *eb = buf;
- } else {
+ ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(buf);
+ else
free_extent_buffer(buf);
- }
- return 0;
}
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1068,19 +1052,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
return alloc_extent_buffer(fs_info, bytenr);
}
-
-int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
-void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-{
- filemap_fdatawait_range(buf->pages[0]->mapping,
- buf->start, buf->start + buf->len - 1);
-}
-
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
@@ -1100,19 +1071,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ ret = btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
if (ret) {
- free_extent_buffer(buf);
+ free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
return buf;
}
-void clean_tree_block(struct btrfs_fs_info *fs_info,
- struct extent_buffer *buf)
+void btrfs_clean_tree_block(struct extent_buffer *buf)
{
+ struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -1208,7 +1179,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->log_transid_committed = -1;
root->last_log_commit = 0;
if (!dummy)
- extent_io_tree_init(&root->dirty_log_pages, NULL);
+ extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1255,9 +1227,9 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
#endif
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *leaf;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
@@ -1680,8 +1652,8 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio(bio);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}
static int cleaner_kthread(void *arg)
@@ -1776,7 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = ktime_get_seconds();
- if (cur->state < TRANS_STATE_BLOCKED &&
+ if (cur->state < TRANS_STATE_COMMIT_START &&
!test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
@@ -1815,18 +1787,18 @@ sleep:
}
/*
- * this will find the highest generation in the array of
- * root backups. The index of the highest array is returned,
- * or -1 if we can't find anything.
+ * This will find the highest generation in the array of root backups. The
+ * index of the highest array is returned, or -EINVAL if we can't find
+ * anything.
*
* We check to make sure the array is valid by comparing the
* generation of the latest root in the array with the generation
* in the super block. If they don't match we pitch it.
*/
-static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+static int find_newest_super_backup(struct btrfs_fs_info *info)
{
+ const u64 newest_gen = btrfs_super_generation(info->super_copy);
u64 cur;
- int newest_index = -1;
struct btrfs_root_backup *root_backup;
int i;
@@ -1834,37 +1806,10 @@ static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
root_backup = info->super_copy->super_roots + i;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
- newest_index = i;
- }
-
- /* check to see if we actually wrapped around */
- if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
- root_backup = info->super_copy->super_roots;
- cur = btrfs_backup_tree_root_gen(root_backup);
- if (cur == newest_gen)
- newest_index = 0;
+ return i;
}
- return newest_index;
-}
-
-
-/*
- * find the oldest backup so we know where to store new entries
- * in the backup array. This will set the backup_root_index
- * field in the fs_info struct
- */
-static void find_oldest_super_backup(struct btrfs_fs_info *info,
- u64 newest_gen)
-{
- int newest_index = -1;
- newest_index = find_newest_super_backup(info, newest_gen);
- /* if there was garbage in there, just move along */
- if (newest_index == -1) {
- info->backup_root_index = 0;
- } else {
- info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
- }
+ return -EINVAL;
}
/*
@@ -1874,22 +1819,8 @@ static void find_oldest_super_backup(struct btrfs_fs_info *info,
*/
static void backup_super_roots(struct btrfs_fs_info *info)
{
- int next_backup;
+ const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
- int last_backup;
-
- next_backup = info->backup_root_index;
- last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
-
- /*
- * just overwrite the last backup if we're at the same generation
- * this happens only at umount
- */
- root_backup = info->super_for_commit->super_roots + last_backup;
- if (btrfs_backup_tree_root_gen(root_backup) ==
- btrfs_header_generation(info->tree_root->node))
- next_backup = last_backup;
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -1962,40 +1893,31 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * this copies info out of the root backup array and back into
- * the in-memory super block. It is meant to help iterate through
- * the array, so you send it the number of backups you've already
- * tried and the last backup index you used.
+ * read_backup_root - Reads a backup root based on the passed priority. Prio 0
+ * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ *
+ * fs_info - filesystem whose backup roots need to be read
+ * priority - priority of backup root required
*
- * this returns -1 when it has tried all the backups
+ * Returns backup root index on success and -EINVAL otherwise.
*/
-static noinline int next_root_backup(struct btrfs_fs_info *info,
- struct btrfs_super_block *super,
- int *num_backups_tried, int *backup_index)
+static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *super = fs_info->super_copy;
struct btrfs_root_backup *root_backup;
- int newest = *backup_index;
- if (*num_backups_tried == 0) {
- u64 gen = btrfs_super_generation(super);
+ if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
+ if (priority == 0)
+ return backup_index;
- newest = find_newest_super_backup(info, gen);
- if (newest == -1)
- return -1;
-
- *backup_index = newest;
- *num_backups_tried = 1;
- } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
- /* we've tried all the backups, all done */
- return -1;
+ backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
+ backup_index %= BTRFS_NUM_BACKUP_ROOTS;
} else {
- /* jump to the next oldest backup */
- newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
- *backup_index = newest;
- *num_backups_tried += 1;
+ return -EINVAL;
}
- root_backup = super->super_roots + newest;
+
+ root_backup = super->super_roots + backup_index;
btrfs_set_super_generation(super,
btrfs_backup_tree_root_gen(root_backup));
@@ -2005,12 +1927,13 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
/*
- * fixme: the total bytes and num_devices need to match or we should
+ * Fixme: the total bytes and num_devices need to match or we should
* need a fsck
*/
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
- return 0;
+
+ return backup_index;
}
/* helper to cleanup workers */
@@ -2025,13 +1948,11 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
- btrfs_destroy_workqueue(fs_info->submit_workers);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
- btrfs_destroy_workqueue(fs_info->extent_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@@ -2052,7 +1973,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
/* helper to cleanup tree roots */
-static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
@@ -2061,7 +1982,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
- if (chunk_root)
+ if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
@@ -2138,8 +2059,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
inode->i_mapping->a_ops = &btree_aops;
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = 0;
+ extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_INODE_IO, inode);
+ BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2162,7 +2084,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->qgroup_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_op_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
@@ -2191,16 +2112,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->caching_workers =
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
- /*
- * a higher idle thresh on the submit workers makes it much more
- * likely that bios will be send down in a sane order to the
- * devices
- */
- fs_info->submit_workers =
- btrfs_alloc_workqueue(fs_info, "submit", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 64);
-
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
@@ -2237,13 +2148,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
- fs_info->extent_workers =
- btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 8);
if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_repair_workers &&
@@ -2251,7 +2158,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
return -ENOMEM;
}
@@ -2259,6 +2165,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
return 0;
}
+static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
+{
+ struct crypto_shash *csum_shash;
+ const char *csum_driver = btrfs_super_csum_driver(csum_type);
+
+ csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
+
+ if (IS_ERR(csum_shash)) {
+ btrfs_err(fs_info, "error allocating %s hash for checksum",
+ csum_driver);
+ return PTR_ERR(csum_shash);
+ }
+
+ fs_info->csum_shash = csum_shash;
+
+ return 0;
+}
+
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+ crypto_free_shash(fs_info->csum_shash);
+}
+
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
@@ -2574,7 +2503,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
- if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+ if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
@@ -2595,7 +2524,101 @@ out:
return ret;
}
-int open_ctree(struct super_block *sb,
+static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
+{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ bool handle_error = false;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ u64 generation;
+ int level;
+
+ if (handle_error) {
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
+ tree_root->node = NULL;
+
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
+ break;
+
+ free_root_pointers(fs_info, 0);
+
+ /*
+ * Don't use the log in recovery mode, it won't be
+ * valid
+ */
+ btrfs_set_super_log_root(sb, 0);
+
+ /* We can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = read_backup_root(fs_info, i);
+ backup_index = ret;
+ if (ret < 0)
+ return ret;
+ }
+ generation = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ generation, level, NULL);
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+
+ if (IS_ERR(tree_root->node))
+ ret = PTR_ERR(tree_root->node);
+ else if (!extent_buffer_uptodate(tree_root->node))
+ ret = -EUCLEAN;
+
+ btrfs_warn(fs_info, "failed to read tree root");
+ continue;
+ }
+
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+ tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
+
+ /*
+ * No need to hold btrfs_root::objectid_mutex since the fs
+ * hasn't been fully initialised and we are the only user
+ */
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ ret = btrfs_read_roots(fs_info);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ /* All successful */
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+
+ /* Always begin writing backup roots after the one being used */
+ if (backup_index < 0) {
+ fs_info->backup_root_index = 0;
+ } else {
+ fs_info->backup_root_index = backup_index + 1;
+ fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
+ }
+ break;
+ }
+
+ return ret;
+}
+
+int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -2604,6 +2627,7 @@ int open_ctree(struct super_block *sb,
u32 stripesize;
u64 generation;
u64 features;
+ u16 csum_type;
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
@@ -2612,8 +2636,6 @@ int open_ctree(struct super_block *sb,
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
- int num_backups_tried = 0;
- int backup_index = 0;
int clear_free_space_tree = 0;
int level;
@@ -2630,11 +2652,17 @@ int open_ctree(struct super_block *sb,
goto fail;
}
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_srcu;
}
+
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ if (ret) {
+ err = ret;
+ goto fail_dio_bytes;
+ }
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2658,8 +2686,6 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
- INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
- spin_lock_init(&fs_info->pending_raid_kobjs_lock);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2667,7 +2693,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
@@ -2681,7 +2706,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
- btrfs_mapping_init(&fs_info->mapping_tree);
+ extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2694,7 +2719,6 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
@@ -2748,8 +2772,10 @@ int open_ctree(struct super_block *sb,
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
- extent_io_tree_init(&fs_info->freed_extents[0], NULL);
- extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
+ IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
+ extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
+ IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
@@ -2776,8 +2802,6 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
- INIT_LIST_HEAD(&fs_info->pinned_chunks);
-
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
@@ -2786,6 +2810,8 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
+ fs_info->send_in_progress = 0;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -2806,6 +2832,25 @@ int open_ctree(struct super_block *sb,
}
/*
+ * Verify the type first, if that or the the checksum value are
+ * corrupted, we'll find out
+ */
+ csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+ if (!btrfs_supported_super_csum(csum_type)) {
+ btrfs_err(fs_info, "unsupported checksum algorithm: %u",
+ csum_type);
+ err = -EINVAL;
+ brelse(bh);
+ goto fail_alloc;
+ }
+
+ ret = btrfs_init_csum_hash(fs_info, csum_type);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
+ /*
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
@@ -2813,7 +2858,7 @@ int open_ctree(struct super_block *sb,
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
brelse(bh);
- goto fail_alloc;
+ goto fail_csum;
}
/*
@@ -2850,24 +2895,17 @@ int open_ctree(struct super_block *sb,
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
- goto fail_alloc;
+ goto fail_csum;
}
if (!btrfs_super_root(disk_super))
- goto fail_alloc;
+ goto fail_csum;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/*
- * run through our array of backup supers and setup
- * our ring pointer to the oldest one
- */
- generation = btrfs_super_generation(disk_super);
- find_oldest_super_backup(fs_info, generation);
-
- /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -2876,7 +2914,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
- goto fail_alloc;
+ goto fail_csum;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -2886,7 +2924,7 @@ int open_ctree(struct super_block *sb,
"cannot mount because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_alloc;
+ goto fail_csum;
}
features = btrfs_super_incompat_flags(disk_super);
@@ -2930,7 +2968,7 @@ int open_ctree(struct super_block *sb,
btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
nodesize, sectorsize);
- goto fail_alloc;
+ goto fail_csum;
}
/*
@@ -2946,7 +2984,7 @@ int open_ctree(struct super_block *sb,
"cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_alloc;
+ goto fail_csum;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3013,44 +3051,9 @@ int open_ctree(struct super_block *sb,
goto fail_tree_roots;
}
-retry_root_backup:
- generation = btrfs_super_generation(disk_super);
- level = btrfs_super_root_level(disk_super);
-
- tree_root->node = read_tree_block(fs_info,
- btrfs_super_root(disk_super),
- generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
- btrfs_warn(fs_info, "failed to read tree root");
- if (!IS_ERR(tree_root->node))
- free_extent_buffer(tree_root->node);
- tree_root->node = NULL;
- goto recovery_tree_root;
- }
-
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
- mutex_lock(&tree_root->objectid_mutex);
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
- if (ret) {
- mutex_unlock(&tree_root->objectid_mutex);
- goto recovery_tree_root;
- }
-
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
-
- mutex_unlock(&tree_root->objectid_mutex);
-
- ret = btrfs_read_roots(fs_info);
+ ret = init_tree_roots(fs_info);
if (ret)
- goto recovery_tree_root;
-
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ goto fail_tree_roots;
ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
@@ -3318,12 +3321,14 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
+fail_csum:
+ btrfs_free_csum_hash(fs_info);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3335,30 +3340,14 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+fail_dio_bytes:
+ percpu_counter_destroy(&fs_info->dio_bytes);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
-
-recovery_tree_root:
- if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
- goto fail_tree_roots;
-
- free_root_pointers(fs_info, 0);
-
- /* don't use the log in recovery mode, it won't be valid */
- btrfs_set_super_log_root(disk_super, 0);
-
- /* we can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
- ret = next_root_backup(fs_info, fs_info->super_copy,
- &num_backups_tried, &backup_index);
- if (ret == -1)
- goto fail_block_groups;
- goto retry_root_backup;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
@@ -3463,17 +3452,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct buffer_head *bh;
int i;
int ret;
int errors = 0;
- u32 crc;
u64 bytenr;
int op_flags;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+ shash->tfm = fs_info->csum_shash;
+
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3482,10 +3474,10 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_set_super_bytenr(sb, bytenr);
- crc = ~(u32)0;
- crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- btrfs_csum_final(crc, sb->csum);
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+ crypto_shash_final(shash, sb->csum);
/* One reference for us, and we leave it for the caller */
bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
@@ -3700,7 +3692,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
- min_tolerated = min(min_tolerated,
+ min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[BTRFS_RAID_SINGLE].
tolerated_failures);
@@ -3709,7 +3701,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
continue;
if (!(flags & btrfs_raid_array[raid_type].bg_flag))
continue;
- min_tolerated = min(min_tolerated,
+ min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[raid_type].
tolerated_failures);
}
@@ -3949,7 +3941,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
return btrfs_commit_transaction(trans);
}
-void close_ctree(struct btrfs_fs_info *fs_info)
+void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -4016,6 +4008,10 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
+ if (percpu_counter_sum(&fs_info->dio_bytes))
+ btrfs_info(fs_info, "at unmount dio bytes count %lld",
+ percpu_counter_sum(&fs_info->dio_bytes));
+
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4033,7 +4029,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_free_block_groups(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
iput(fs_info->btree_inode);
@@ -4042,25 +4038,18 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfsic_unmount(fs_info->fs_devices);
#endif
- btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_close_devices(fs_info->fs_devices);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->dio_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
+ btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
-
- while (!list_empty(&fs_info->pinned_chunks)) {
- struct extent_map *em;
-
- em = list_first_entry(&fs_info->pinned_chunks,
- struct extent_map, list);
- list_del_init(&em->list);
- free_extent_map(em);
- }
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4114,7 +4103,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
* So here we should only check item pointers, not item data.
*/
if (btrfs_header_level(buf) == 0 &&
- btrfs_check_leaf_relaxed(fs_info, buf)) {
+ btrfs_check_leaf_relaxed(buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -4157,10 +4146,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key)
{
- struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
-
- return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ return btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
}
@@ -4420,7 +4406,7 @@ again:
return 0;
}
-static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
+static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
struct inode *inode;
@@ -4437,12 +4423,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
dirty_list);
if (!list_empty(&cache->io_list)) {
@@ -4470,7 +4456,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
*/
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
@@ -4484,10 +4470,17 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
+ struct btrfs_device *dev, *tmp;
+
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
+ list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
+ post_commit_list) {
+ list_del_init(&dev->post_commit_list);
+ }
+
btrfs_destroy_delayed_refs(cur_trans, fs_info);
cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 987a64bc0c66..76f123ebb292 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,20 +39,20 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 parent_transid, int level,
struct btrfs_key *first_key);
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
-int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr);
-void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
-int open_ctree(struct super_block *sb,
+void btrfs_clean_tree_block(struct extent_buffer *buf);
+int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-void close_ctree(struct btrfs_fs_info *fs_info);
+void __cold close_ctree(struct btrfs_fs_info *fs_info);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
@@ -113,8 +113,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key);
-u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
-void btrfs_csum_final(u32 crc, u8 *result);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -123,8 +121,6 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
-int btrfs_write_tree_block(struct extent_buffer *buf);
-void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
@@ -134,7 +130,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ddf28ecf17f9..72e312cae69d 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -87,7 +87,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, root, NULL);
+ inode = btrfs_iget(sb, &key, root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail;
@@ -214,7 +214,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL));
+ return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
new file mode 100644
index 000000000000..a3febe746c79
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.h
@@ -0,0 +1,248 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_IO_TREE_H
+#define BTRFS_EXTENT_IO_TREE_H
+
+struct extent_changeset;
+struct io_failure_record;
+
+/* Bits for the extent state */
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_UPTODATE (1U << 1)
+#define EXTENT_LOCKED (1U << 2)
+#define EXTENT_NEW (1U << 3)
+#define EXTENT_DELALLOC (1U << 4)
+#define EXTENT_DEFRAG (1U << 5)
+#define EXTENT_BOUNDARY (1U << 6)
+#define EXTENT_NODATASUM (1U << 7)
+#define EXTENT_CLEAR_META_RESV (1U << 8)
+#define EXTENT_NEED_WAIT (1U << 9)
+#define EXTENT_DAMAGED (1U << 10)
+#define EXTENT_NORESERVE (1U << 11)
+#define EXTENT_QGROUP_RESERVED (1U << 12)
+#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+#define EXTENT_DELALLOC_NEW (1U << 14)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+
+/*
+ * Redefined bits above which are used only in the device allocation tree,
+ * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
+ * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
+ * manipulation functions
+ */
+#define CHUNK_ALLOCATED EXTENT_DIRTY
+#define CHUNK_TRIMMED EXTENT_DEFRAG
+
+enum {
+ IO_TREE_FS_INFO_FREED_EXTENTS0,
+ IO_TREE_FS_INFO_FREED_EXTENTS1,
+ IO_TREE_INODE_IO,
+ IO_TREE_INODE_IO_FAILURE,
+ IO_TREE_RELOC_BLOCKS,
+ IO_TREE_TRANS_DIRTY_PAGES,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_SELFTEST,
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct btrfs_fs_info *fs_info;
+ void *private_data;
+ u64 dirty_bytes;
+ bool track_uptodate;
+
+ /* Who owns this io tree, should be one of IO_TREE_* */
+ u8 owner;
+
+ spinlock_t lock;
+ const struct extent_io_ops *ops;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ wait_queue_head_t wq;
+ refcount_t refs;
+ unsigned state;
+
+ struct io_failure_record *failrec;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+int __init extent_state_cache_init(void);
+void __cold extent_state_cache_exit(void);
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data);
+void extent_io_tree_release(struct extent_io_tree *tree);
+
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+
+int __init extent_io_init(void);
+void __cold extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int filled,
+ struct extent_state *cached_state);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+ u64 start, u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_ATOMIC, NULL);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, GFP_NOFS, NULL);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, cached);
+}
+
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned clear_bits,
+ struct extent_state **cached_state);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned int extra_bits,
+ struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+ GFP_NOFS);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
+ struct extent_state **cached_state);
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state);
+
+/* This should be reworked in the future and put elsewhere. */
+int get_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record **failrec);
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
+ u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset);
+
+#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c5880329ae37..274318e9114e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -16,6 +16,7 @@
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
+#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -24,50 +25,16 @@
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
+#include "space-info.h"
+#include "block-rsv.h"
+#include "delalloc-space.h"
+#include "block-group.h"
#undef SCRAMBLE_DELAYED_REFS
-/*
- * control flags for do_chunk_alloc's force field
- * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
- * if we really need one.
- *
- * CHUNK_ALLOC_LIMITED means to only try and allocate one
- * if we have very few chunks already allocated. This is
- * used as part of the clustering code to help make sure
- * we have a good pool of storage to cluster in, without
- * filling the FS with empty chunks
- *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
- */
-enum {
- CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_LIMITED = 1,
- CHUNK_ALLOC_FORCE = 2,
-};
-
-/*
- * Declare a helper function to detect underflow of various space info members
- */
-#define DECLARE_SPACE_INFO_UPDATE(name) \
-static inline void update_##name(struct btrfs_space_info *sinfo, \
- s64 bytes) \
-{ \
- if (bytes < 0 && sinfo->name < -bytes) { \
- WARN_ON(1); \
- sinfo->name = 0; \
- return; \
- } \
- sinfo->name += bytes; \
-}
-
-DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
-DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
@@ -84,148 +51,16 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
- int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
-static void dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups);
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 num_bytes);
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 num_bytes);
-
-static noinline int
-block_group_cache_done(struct btrfs_block_group_cache *cache)
-{
- smp_mb();
- return cache->cached == BTRFS_CACHE_FINISHED ||
- cache->cached == BTRFS_CACHE_ERROR;
-}
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
-{
- atomic_inc(&cache->count);
-}
-
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
- if (atomic_dec_and_test(&cache->count)) {
- WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
-
- /*
- * If not empty, someone is still holding mutex of
- * full_stripe_lock, which can only be released by caller.
- * And it will definitely cause use-after-free when caller
- * tries to release full stripe lock.
- *
- * No better way to resolve, but only to warn.
- */
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
- kfree(cache->free_space_ctl);
- kfree(cache);
- }
-}
-
-/*
- * this adds the block group to the fs_info rb tree for the block group
- * cache
- */
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group_cache *block_group)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_block_group_cache *cache;
-
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
-
- while (*p) {
- parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group_cache,
- cache_node);
- if (block_group->key.objectid < cache->key.objectid) {
- p = &(*p)->rb_left;
- } else if (block_group->key.objectid > cache->key.objectid) {
- p = &(*p)->rb_right;
- } else {
- spin_unlock(&info->block_group_cache_lock);
- return -EEXIST;
- }
- }
-
- rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
-
- if (info->first_logical_byte > block_group->key.objectid)
- info->first_logical_byte = block_group->key.objectid;
-
- spin_unlock(&info->block_group_cache_lock);
-
- return 0;
-}
-
-/*
- * This will return the block group at or after bytenr if contains is 0, else
- * it will return the block group that contains the bytenr
- */
-static struct btrfs_block_group_cache *
-block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
- int contains)
-{
- struct btrfs_block_group_cache *cache, *ret = NULL;
- struct rb_node *n;
- u64 end, start;
-
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
-
- while (n) {
- cache = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- end = cache->key.objectid + cache->key.offset - 1;
- start = cache->key.objectid;
-
- if (bytenr < start) {
- if (!contains && (!ret || start < ret->key.objectid))
- ret = cache;
- n = n->rb_left;
- } else if (bytenr > start) {
- if (contains && bytenr <= end) {
- ret = cache;
- break;
- }
- n = n->rb_right;
- } else {
- ret = cache;
- break;
- }
- }
- if (ret) {
- btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
- info->first_logical_byte = ret->key.objectid;
- }
- spin_unlock(&info->block_group_cache_lock);
-
- return ret;
-}
-
-static int add_excluded_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 num_bytes)
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
set_extent_bits(&fs_info->freed_extents[0],
@@ -235,13 +70,13 @@ static int add_excluded_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-static void free_excluded_extents(struct btrfs_block_group_cache *cache)
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 start, end;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
clear_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
@@ -249,547 +84,39 @@ static void free_excluded_extents(struct btrfs_block_group_cache *cache)
start, end, EXTENT_UPTODATE);
}
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
+static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
- struct btrfs_fs_info *fs_info = cache->fs_info;
- u64 bytenr;
- u64 *logical;
- int stripe_len;
- int i, nr, ret;
-
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
- cache->bytes_super += stripe_len;
- ret = add_excluded_extent(fs_info, cache->key.objectid,
- stripe_len);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
- bytenr, &logical, &nr, &stripe_len);
- if (ret)
- return ret;
-
- while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->key.objectid +
- cache->key.offset)
- continue;
-
- if (logical[nr] + stripe_len <= cache->key.objectid)
- continue;
-
- start = logical[nr];
- if (start < cache->key.objectid) {
- start = cache->key.objectid;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->key.objectid +
- cache->key.offset - start);
- }
-
- cache->bytes_super += len;
- ret = add_excluded_extent(fs_info, start, len);
- if (ret) {
- kfree(logical);
- return ret;
- }
- }
-
- kfree(logical);
- }
- return 0;
-}
-
-static struct btrfs_caching_control *
-get_caching_control(struct btrfs_block_group_cache *cache)
-{
- struct btrfs_caching_control *ctl;
-
- spin_lock(&cache->lock);
- if (!cache->caching_ctl) {
- spin_unlock(&cache->lock);
- return NULL;
- }
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- spin_unlock(&cache->lock);
- return ctl;
-}
-
-static void put_caching_control(struct btrfs_caching_control *ctl)
-{
- if (refcount_dec_and_test(&ctl->count))
- kfree(ctl);
-}
-
-#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 start = block_group->key.objectid;
- u64 len = block_group->key.offset;
- u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
- fs_info->nodesize : fs_info->sectorsize;
- u64 step = chunk << 1;
-
- while (len > chunk) {
- btrfs_remove_free_space(block_group, start, chunk);
- start += step;
- if (len < step)
- len = 0;
+ if (ref->type == BTRFS_REF_METADATA) {
+ if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
else
- len -= step;
- }
-}
-#endif
-
-/*
- * this is only called by cache_block_group, since we could have freed extents
- * we need to check the pinned_extents for any extents that can't be used yet
- * since their free space will be released as soon as the transaction commits.
- */
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- u64 start, u64 end)
-{
- struct btrfs_fs_info *info = block_group->fs_info;
- u64 extent_start, extent_end, size, total_added = 0;
- int ret;
-
- while (start < end) {
- ret = find_first_extent_bit(info->pinned_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL);
- if (ret)
- break;
-
- if (extent_start <= start) {
- start = extent_end + 1;
- } else if (extent_start > start && extent_start < end) {
- size = extent_start - start;
- total_added += size;
- ret = btrfs_add_free_space(block_group, start,
- size);
- BUG_ON(ret); /* -ENOMEM or logic error */
- start = extent_end + 1;
- } else {
- break;
- }
- }
-
- if (start < end) {
- size = end - start;
- total_added += size;
- ret = btrfs_add_free_space(block_group, start, size);
- BUG_ON(ret); /* -ENOMEM or logic error */
- }
-
- return total_added;
-}
-
-static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
-{
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- u64 total_found = 0;
- u64 last = 0;
- u32 nritems;
- int ret;
- bool wakeup = true;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
-
-#ifdef CONFIG_BTRFS_DEBUG
- /*
- * If we're fragmenting we don't want to make anybody think we can
- * allocate from this block group until we've had a chance to fragment
- * the free space.
- */
- if (btrfs_should_fragment_free_space(block_group))
- wakeup = false;
-#endif
- /*
- * We don't want to deadlock with somebody trying to allocate a new
- * extent for the extent root while also trying to search the extent
- * root to add free space. So we skip locking and search the commit
- * root, since its read-only
- */
- path->skip_locking = 1;
- path->search_commit_root = 1;
- path->reada = READA_FORWARD;
-
- key.objectid = last;
- key.offset = 0;
- key.type = BTRFS_EXTENT_ITEM_KEY;
-
-next:
- ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (btrfs_fs_closing(fs_info) > 1) {
- last = (u64)-1;
- break;
- }
-
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- } else {
- ret = find_next_key(path, 0, &key);
- if (ret)
- break;
-
- if (need_resched() ||
- rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
- btrfs_release_path(path);
- up_read(&fs_info->commit_root_sem);
- mutex_unlock(&caching_ctl->mutex);
- cond_resched();
- mutex_lock(&caching_ctl->mutex);
- down_read(&fs_info->commit_root_sem);
- goto next;
- }
-
- ret = btrfs_next_leaf(extent_root, path);
- if (ret < 0)
- goto out;
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- if (key.objectid < last) {
- key.objectid = last;
- key.offset = 0;
- key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
- btrfs_release_path(path);
- goto next;
- }
-
- if (key.objectid < block_group->key.objectid) {
- path->slots[0]++;
- continue;
- }
-
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
- key.type == BTRFS_METADATA_ITEM_KEY) {
- total_found += add_new_free_space(block_group, last,
- key.objectid);
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- last = key.objectid +
- fs_info->nodesize;
- else
- last = key.objectid + key.offset;
-
- if (total_found > CACHING_CTL_WAKE_UP) {
- total_found = 0;
- if (wakeup)
- wake_up(&caching_ctl->wait);
- }
- }
- path->slots[0]++;
- }
- ret = 0;
-
- total_found += add_new_free_space(block_group, last,
- block_group->key.objectid +
- block_group->key.offset);
- caching_ctl->progress = (u64)-1;
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static noinline void caching_thread(struct btrfs_work *work)
-{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
- int ret;
-
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
- block_group = caching_ctl->block_group;
- fs_info = block_group->fs_info;
-
- mutex_lock(&caching_ctl->mutex);
- down_read(&fs_info->commit_root_sem);
-
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
- ret = load_free_space_tree(caching_ctl);
- else
- ret = load_extent_tree_free(caching_ctl);
-
- spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
- spin_unlock(&block_group->lock);
-
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(block_group)) {
- u64 bytes_used;
-
- spin_lock(&block_group->space_info->lock);
- spin_lock(&block_group->lock);
- bytes_used = block_group->key.offset -
- btrfs_block_group_used(&block_group->item);
- block_group->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&block_group->lock);
- spin_unlock(&block_group->space_info->lock);
- fragment_free_space(block_group);
+ return BTRFS_BLOCK_GROUP_METADATA;
}
-#endif
-
- caching_ctl->progress = (u64)-1;
-
- up_read(&fs_info->commit_root_sem);
- free_excluded_extents(block_group);
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
-
- put_caching_control(caching_ctl);
- btrfs_put_block_group(block_group);
+ return BTRFS_BLOCK_GROUP_DATA;
}
-static int cache_block_group(struct btrfs_block_group_cache *cache,
- int load_cache_only)
-{
- DEFINE_WAIT(wait);
- struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_caching_control *caching_ctl;
- int ret = 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- if (!caching_ctl)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- refcount_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
- caching_thread, NULL, NULL);
-
- spin_lock(&cache->lock);
- /*
- * This should be a rare occasion, but this could happen I think in the
- * case where one thread starts to load the space cache info, and then
- * some other thread starts a transaction commit which tries to do an
- * allocation while the other thread is still loading the space cache
- * info. The previous loop should have kept us from choosing this block
- * group, but if we've moved to the state where we will wait on caching
- * block groups we need to first check if we're doing a fast load here,
- * so we can wait for it to finish, otherwise we could end up allocating
- * from a block group who's cache gets evicted for one reason or
- * another.
- */
- while (cache->cached == BTRFS_CACHE_FAST) {
- struct btrfs_caching_control *ctl;
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&cache->lock);
-
- schedule();
-
- finish_wait(&ctl->wait, &wait);
- put_caching_control(ctl);
- spin_lock(&cache->lock);
- }
-
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
- return 0;
- }
- WARN_ON(cache->caching_ctl);
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
- mutex_lock(&caching_ctl->mutex);
- ret = load_free_space_cache(fs_info, cache);
-
- spin_lock(&cache->lock);
- if (ret == 1) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->last_byte_to_unpin = (u64)-1;
- caching_ctl->progress = (u64)-1;
- } else {
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- }
- spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
- if (ret == 1 &&
- btrfs_should_fragment_free_space(cache)) {
- u64 bytes_used;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- bytes_used = cache->key.offset -
- btrfs_block_group_used(&cache->item);
- cache->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fragment_free_space(cache);
- }
-#endif
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
- if (ret == 1) {
- put_caching_control(caching_ctl);
- free_excluded_extents(cache);
- return 0;
- }
- } else {
- /*
- * We're either using the free space tree or no caching at all.
- * Set cached to the appropriate value and wakeup any waiters.
- */
- spin_lock(&cache->lock);
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- spin_unlock(&cache->lock);
- wake_up(&caching_ctl->wait);
- }
-
- if (load_cache_only) {
- put_caching_control(caching_ctl);
- return 0;
- }
-
- down_write(&fs_info->commit_root_sem);
- refcount_inc(&caching_ctl->count);
- list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->commit_root_sem);
-
- btrfs_get_block_group(cache);
-
- btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
-
- return ret;
-}
-
-/*
- * return the block group that starts at or after bytenr
- */
-static struct btrfs_block_group_cache *
-btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
-{
- return block_group_cache_tree_search(info, bytenr, 0);
-}
-
-/*
- * return the block group that contains the given bytenr
- */
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
- struct btrfs_fs_info *info,
- u64 bytenr)
-{
- return block_group_cache_tree_search(info, bytenr, 1);
-}
-
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
- u64 flags)
-{
- struct list_head *head = &info->space_info;
- struct btrfs_space_info *found;
-
- flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
- return found;
- }
- }
- rcu_read_unlock();
- return NULL;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
- bool metadata, u64 root_objectid)
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *ref)
{
struct btrfs_space_info *space_info;
- u64 flags;
-
- if (metadata) {
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- } else {
- flags = BTRFS_BLOCK_GROUP_DATA;
- }
+ u64 flags = generic_ref_to_space_flags(ref);
- space_info = __find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
+ percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
BTRFS_TOTAL_BYTES_PINNED_BATCH);
}
-/*
- * after adding space to the filesystem, we need to clear the full flags
- * on all the space infos.
- */
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *ref)
{
- struct list_head *head = &info->space_info;
- struct btrfs_space_info *found;
+ struct btrfs_space_info *space_info;
+ u64 flags = generic_ref_to_space_flags(ref);
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
- found->full = 0;
- rcu_read_unlock();
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
}
/* simple helper to search for an existing data extent at a given offset */
@@ -1111,18 +438,18 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
return BTRFS_REF_TYPE_INVALID;
}
-static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
u32 high_crc = ~(u32)0;
u32 low_crc = ~(u32)0;
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -1704,7 +1031,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(fs_info, path, size);
+ btrfs_extend_item(path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1779,7 +1106,6 @@ void update_inline_extent_backref(struct btrfs_path *path,
int *last_ref)
{
struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
struct btrfs_extent_data_ref *dref = NULL;
struct btrfs_shared_data_ref *sref = NULL;
@@ -1834,7 +1160,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(fs_info, path, item_size, 1);
+ btrfs_truncate_item(path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
}
@@ -1905,7 +1231,6 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 *discarded_bytes)
{
@@ -1981,8 +1306,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
- int ret;
+ int ret = 0;
u64 discarded_bytes = 0;
+ u64 end = bytenr + num_bytes;
+ u64 cur = bytenr;
struct btrfs_bio *bbio = NULL;
@@ -1991,15 +1318,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
- &bbio, 0);
- /* Error condition is -ENOMEM */
- if (!ret) {
- struct btrfs_bio_stripe *stripe = bbio->stripes;
+ while (cur < end) {
+ struct btrfs_bio_stripe *stripe;
int i;
+ num_bytes = end - cur;
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
+ &num_bytes, &bbio, 0);
+ /*
+ * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
+ * thus we can't continue anyway.
+ */
+ if (ret < 0)
+ goto out;
+ stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
struct request_queue *req_q;
@@ -2016,10 +1351,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe->physical,
stripe->length,
&bytes);
- if (!ret)
+ if (!ret) {
discarded_bytes += bytes;
- else if (ret != -EOPNOTSUPP)
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+ } else if (ret != -EOPNOTSUPP) {
+ /*
+ * Logic errors or -ENOMEM, or -EIO, but
+ * unlikely to happen.
+ *
+ * And since there are two loops, explicitly
+ * go to out to avoid confusion.
+ */
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -2029,7 +1373,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
ret = 0;
}
btrfs_put_bbio(bbio);
+ cur += num_bytes;
}
+out:
btrfs_bio_counter_dec(fs_info);
if (actual_bytes)
@@ -2043,39 +1389,28 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+ struct btrfs_ref *generic_ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
- root_objectid == BTRFS_TREE_LOG_OBJECTID);
-
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
- owner, offset, BTRFS_ADD_DELAYED_REF);
+ ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
+ generic_ref->action);
+ BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
+ generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
- num_bytes, parent,
- root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL,
- &old_ref_mod, &new_ref_mod);
- } else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
- num_bytes, parent,
- root_objectid, owner, offset,
- 0, BTRFS_ADD_DELAYED_REF,
+ if (generic_ref->type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
+ NULL, &old_ref_mod, &new_ref_mod);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
&old_ref_mod, &new_ref_mod);
- }
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+ btrfs_ref_tree_mod(fs_info, generic_ref);
- add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
- }
+ if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+ sub_pinned_bytes(fs_info, generic_ref);
return ret;
}
@@ -2472,7 +1807,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = __find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
percpu_counter_add_batch(&space_info->total_bytes_pinned,
-head->num_bytes,
@@ -2534,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info, head->bytenr,
- head->num_bytes);
+ ret = btrfs_del_csums(trans, fs_info->csum_root,
+ head->bytenr, head->num_bytes);
}
}
@@ -2834,140 +2169,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- bool ret = false;
- u64 reserved;
-
- spin_lock(&global_rsv->lock);
- reserved = global_rsv->reserved;
- spin_unlock(&global_rsv->lock);
-
- /*
- * Since the global reserve is just kind of magic we don't really want
- * to rely on it to save our bacon, so if our size is more than the
- * delayed_refs_rsv and the global rsv then it's time to think about
- * bailing.
- */
- spin_lock(&delayed_refs_rsv->lock);
- reserved += delayed_refs_rsv->reserved;
- if (delayed_refs_rsv->size >= reserved)
- ret = true;
- spin_unlock(&delayed_refs_rsv->lock);
- return ret;
-}
-
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
-{
- u64 num_entries =
- atomic_read(&trans->transaction->delayed_refs.num_entries);
- u64 avg_runtime;
- u64 val;
-
- smp_mb();
- avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
- val = num_entries * avg_runtime;
- if (val >= NSEC_PER_SEC)
- return 1;
- if (val >= NSEC_PER_SEC / 2)
- return 2;
-
- return btrfs_check_space_for_delayed_refs(trans->fs_info);
-}
-
-struct async_delayed_refs {
- struct btrfs_root *root;
- u64 transid;
- int count;
- int error;
- int sync;
- struct completion wait;
- struct btrfs_work work;
-};
-
-static inline struct async_delayed_refs *
-to_async_delayed_refs(struct btrfs_work *work)
-{
- return container_of(work, struct async_delayed_refs, work);
-}
-
-static void delayed_ref_async_start(struct btrfs_work *work)
-{
- struct async_delayed_refs *async = to_async_delayed_refs(work);
- struct btrfs_trans_handle *trans;
- struct btrfs_fs_info *fs_info = async->root->fs_info;
- int ret;
-
- /* if the commit is already started, we don't need to wait here */
- if (btrfs_transaction_blocked(fs_info))
- goto done;
-
- trans = btrfs_join_transaction(async->root);
- if (IS_ERR(trans)) {
- async->error = PTR_ERR(trans);
- goto done;
- }
-
- /*
- * trans->sync means that when we call end_transaction, we won't
- * wait on delayed refs
- */
- trans->sync = true;
-
- /* Don't bother flushing if we got into a different transaction */
- if (trans->transid > async->transid)
- goto end;
-
- ret = btrfs_run_delayed_refs(trans, async->count);
- if (ret)
- async->error = ret;
-end:
- ret = btrfs_end_transaction(trans);
- if (ret && !async->error)
- async->error = ret;
-done:
- if (async->sync)
- complete(&async->wait);
- else
- kfree(async);
-}
-
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
- unsigned long count, u64 transid, int wait)
-{
- struct async_delayed_refs *async;
- int ret;
-
- async = kmalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- return -ENOMEM;
-
- async->root = fs_info->tree_root;
- async->count = count;
- async->error = 0;
- async->transid = transid;
- if (wait)
- async->sync = 1;
- else
- async->sync = 0;
- init_completion(&async->wait);
-
- btrfs_init_work(&async->work, btrfs_extent_refs_helper,
- delayed_ref_async_start, NULL, NULL);
-
- btrfs_queue_work(fs_info->extent_workers, &async->work);
-
- if (wait) {
- wait_for_completion(&async->wait);
- ret = async->error;
- kfree(async);
- return ret;
- }
- return 0;
-}
-
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -3036,7 +2237,6 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 flags,
int level, int is_data)
{
@@ -3053,8 +2253,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
- num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -3179,16 +2378,19 @@ static noinline int check_committed_ref(struct btrfs_root *root,
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ /* If extent item has more than 1 inline ref then it's shared */
if (item_size != sizeof(*ei) +
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
+ /* If extent created before last snapshot => it's definitely shared */
if (btrfs_extent_generation(leaf, ei) <=
btrfs_root_last_snapshot(&root->root_item))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ /* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
goto out;
@@ -3246,13 +2448,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
u32 nritems;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
+ struct btrfs_ref generic_ref = { 0 };
+ bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
int i;
+ int action;
int level;
int ret = 0;
- int (*process_func)(struct btrfs_trans_handle *,
- struct btrfs_root *,
- u64, u64, u64, u64, u64, u64);
-
if (btrfs_is_testing(fs_info))
return 0;
@@ -3264,15 +2465,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
return 0;
- if (inc)
- process_func = btrfs_inc_extent_ref;
- else
- process_func = btrfs_free_extent;
-
if (full_backref)
parent = buf->start;
else
parent = 0;
+ if (inc)
+ action = BTRFS_ADD_DELAYED_REF;
+ else
+ action = BTRFS_DROP_DELAYED_REF;
for (i = 0; i < nritems; i++) {
if (level == 0) {
@@ -3290,16 +2490,30 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
- ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, key.objectid,
- key.offset);
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
+ num_bytes, parent);
+ generic_ref.real_root = root->root_key.objectid;
+ btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
+ key.offset);
+ generic_ref.skip_qgroup = for_reloc;
+ if (inc)
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ else
+ ret = btrfs_free_extent(trans, &generic_ref);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
- ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0);
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
+ num_bytes, parent);
+ generic_ref.real_root = root->root_key.objectid;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
+ generic_ref.skip_qgroup = for_reloc;
+ if (inc)
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ else
+ ret = btrfs_free_extent(trans, &generic_ref);
if (ret)
goto fail;
}
@@ -3321,561 +2535,9 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- struct btrfs_block_group_cache *cache)
-{
- int ret;
- struct btrfs_root *extent_root = fs_info->extent_root;
- unsigned long bi;
- struct extent_buffer *leaf;
-
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
- if (ret) {
- if (ret > 0)
- ret = -ENOENT;
- goto fail;
- }
-
- leaf = path->nodes[0];
- bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
- btrfs_mark_buffer_dirty(leaf);
-fail:
- btrfs_release_path(path);
- return ret;
-
-}
-
-static struct btrfs_block_group_cache *
-next_block_group(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache)
-{
- struct rb_node *node;
-
- spin_lock(&fs_info->block_group_cache_lock);
-
- /* If our block group was removed, we need a full search. */
- if (RB_EMPTY_NODE(&cache->cache_node)) {
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
-
- spin_unlock(&fs_info->block_group_cache_lock);
- btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
- }
- node = rb_next(&cache->cache_node);
- btrfs_put_block_group(cache);
- if (node) {
- cache = rb_entry(node, struct btrfs_block_group_cache,
- cache_node);
- btrfs_get_block_group(cache);
- } else
- cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
- return cache;
-}
-
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
- struct inode *inode = NULL;
- struct extent_changeset *data_reserved = NULL;
- u64 alloc_hint = 0;
- int dcs = BTRFS_DC_ERROR;
- u64 num_pages = 0;
- int retries = 0;
- int ret = 0;
-
- /*
- * If this block group is smaller than 100 megs don't bother caching the
- * block group.
- */
- if (block_group->key.offset < (100 * SZ_1M)) {
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
- spin_unlock(&block_group->lock);
- return 0;
- }
-
- if (trans->aborted)
- return 0;
-again:
- inode = lookup_free_space_inode(fs_info, block_group, path);
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
- ret = PTR_ERR(inode);
- btrfs_release_path(path);
- goto out;
- }
-
- if (IS_ERR(inode)) {
- BUG_ON(retries);
- retries++;
-
- if (block_group->ro)
- goto out_free;
-
- ret = create_free_space_inode(fs_info, trans, block_group,
- path);
- if (ret)
- goto out_free;
- goto again;
- }
-
- /*
- * We want to set the generation to 0, that way if anything goes wrong
- * from here on out we know not to trust this cache when we load up next
- * time.
- */
- BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- /*
- * So theoretically we could recover from this, simply set the
- * super cache generation to 0 so we know to invalidate the
- * cache, but then we'd have to keep track of the block groups
- * that fail this way so we know we _have_ to reset this cache
- * before the next commit or risk reading stale cache. So to
- * limit our exposure to horrible edge cases lets just abort the
- * transaction, this only happens in really bad situations
- * anyway.
- */
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
- WARN_ON(ret);
-
- /* We've already setup this transaction, go ahead and exit */
- if (block_group->cache_generation == trans->transid &&
- i_size_read(inode)) {
- dcs = BTRFS_DC_SETUP;
- goto out_put;
- }
-
- if (i_size_read(inode) > 0) {
- ret = btrfs_check_trunc_cache_free_space(fs_info,
- &fs_info->global_block_rsv);
- if (ret)
- goto out_put;
-
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
- if (ret)
- goto out_put;
- }
-
- spin_lock(&block_group->lock);
- if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(fs_info, SPACE_CACHE)) {
- /*
- * don't bother trying to write stuff out _if_
- * a) we're not cached,
- * b) we're with nospace_cache mount option,
- * c) we're with v2 space_cache (FREE_SPACE_TREE).
- */
- dcs = BTRFS_DC_WRITTEN;
- spin_unlock(&block_group->lock);
- goto out_put;
- }
- spin_unlock(&block_group->lock);
-
- /*
- * We hit an ENOSPC when setting up the cache in this transaction, just
- * skip doing the setup, we've already cleared the cache so we're safe.
- */
- if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
- ret = -ENOSPC;
- goto out_put;
- }
-
- /*
- * Try to preallocate enough space based on how big the block group is.
- * Keep in mind this has to include any pinned space which could end up
- * taking up quite a bit since it's not folded into the other space
- * cache.
- */
- num_pages = div_u64(block_group->key.offset, SZ_256M);
- if (!num_pages)
- num_pages = 1;
-
- num_pages *= 16;
- num_pages *= PAGE_SIZE;
-
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
- if (ret)
- goto out_put;
-
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
- num_pages, num_pages,
- &alloc_hint);
- /*
- * Our cache requires contiguous chunks so that we don't modify a bunch
- * of metadata or split extents when writing the cache out, which means
- * we can enospc if we are heavily fragmented in addition to just normal
- * out of space conditions. So if we hit this just skip setting up any
- * other block groups for this transaction, maybe we'll unpin enough
- * space the next time around.
- */
- if (!ret)
- dcs = BTRFS_DC_SETUP;
- else if (ret == -ENOSPC)
- set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-
-out_put:
- iput(inode);
-out_free:
- btrfs_release_path(path);
-out:
- spin_lock(&block_group->lock);
- if (!ret && dcs == BTRFS_DC_SETUP)
- block_group->cache_generation = trans->transid;
- block_group->disk_cache_state = dcs;
- spin_unlock(&block_group->lock);
-
- extent_changeset_free(data_reserved);
- return ret;
-}
-
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_group_cache *cache, *tmp;
- struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
-
- if (list_empty(&cur_trans->dirty_bgs) ||
- !btrfs_test_opt(fs_info, SPACE_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /* Could add new block groups, use _safe just in case */
- list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
- dirty_list) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- cache_save_setup(cache, trans, path);
- }
-
- btrfs_free_path(path);
- return 0;
-}
-
-/*
- * transaction commit does final block group cache writeback during a
- * critical section where nothing is allowed to change the FS. This is
- * required in order for the cache to actually match the block group,
- * but can introduce a lot of latency into the commit.
- *
- * So, btrfs_start_dirty_block_groups is here to kick off block group
- * cache IO. There's a chance we'll have to redo some of it if the
- * block group changes again during the commit, but it greatly reduces
- * the commit latency by getting rid of the easy block groups while
- * we're still allowing others to join the commit.
- */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
- struct btrfs_transaction *cur_trans = trans->transaction;
- int ret = 0;
- int should_put;
- struct btrfs_path *path = NULL;
- LIST_HEAD(dirty);
- struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
- int loops = 0;
-
- spin_lock(&cur_trans->dirty_bgs_lock);
- if (list_empty(&cur_trans->dirty_bgs)) {
- spin_unlock(&cur_trans->dirty_bgs_lock);
- return 0;
- }
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
- spin_unlock(&cur_trans->dirty_bgs_lock);
-
-again:
- /*
- * make sure all the block groups on our dirty list actually
- * exist
- */
- btrfs_create_pending_block_groups(trans);
-
- if (!path) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- }
-
- /*
- * cache_write_mutex is here only to save us from balance or automatic
- * removal of empty block groups deleting this block group while we are
- * writing out the cache
- */
- mutex_lock(&trans->transaction->cache_write_mutex);
- while (!list_empty(&dirty)) {
- bool drop_reserve = true;
-
- cache = list_first_entry(&dirty,
- struct btrfs_block_group_cache,
- dirty_list);
- /*
- * this can happen if something re-dirties a block
- * group that is already under IO. Just wait for it to
- * finish and then do it all again
- */
- if (!list_empty(&cache->io_list)) {
- list_del_init(&cache->io_list);
- btrfs_wait_cache_io(trans, cache, path);
- btrfs_put_block_group(cache);
- }
-
-
- /*
- * btrfs_wait_cache_io uses the cache->dirty_list to decide
- * if it should update the cache_state. Don't delete
- * until after we wait.
- *
- * Since we're not running in the commit critical section
- * we need the dirty_bgs_lock to protect from update_block_group
- */
- spin_lock(&cur_trans->dirty_bgs_lock);
- list_del_init(&cache->dirty_list);
- spin_unlock(&cur_trans->dirty_bgs_lock);
-
- should_put = 1;
-
- cache_save_setup(cache, trans, path);
-
- if (cache->disk_cache_state == BTRFS_DC_SETUP) {
- cache->io_ctl.inode = NULL;
- ret = btrfs_write_out_cache(fs_info, trans,
- cache, path);
- if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
- should_put = 0;
-
- /*
- * The cache_write_mutex is protecting the
- * io_list, also refer to the definition of
- * btrfs_transaction::io_bgs for more details
- */
- list_add_tail(&cache->io_list, io);
- } else {
- /*
- * if we failed to write the cache, the
- * generation will be bad and life goes on
- */
- ret = 0;
- }
- }
- if (!ret) {
- ret = write_one_cache_group(trans, fs_info,
- path, cache);
- /*
- * Our block group might still be attached to the list
- * of new block groups in the transaction handle of some
- * other task (struct btrfs_trans_handle->new_bgs). This
- * means its block group item isn't yet in the extent
- * tree. If this happens ignore the error, as we will
- * try again later in the critical section of the
- * transaction commit.
- */
- if (ret == -ENOENT) {
- ret = 0;
- spin_lock(&cur_trans->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &cur_trans->dirty_bgs);
- btrfs_get_block_group(cache);
- drop_reserve = false;
- }
- spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret) {
- btrfs_abort_transaction(trans, ret);
- }
- }
-
- /* if it's not on the io list, we need to put the block group */
- if (should_put)
- btrfs_put_block_group(cache);
- if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
- /*
- * Avoid blocking other tasks for too long. It might even save
- * us from writing caches for block groups that are going to be
- * removed.
- */
- mutex_unlock(&trans->transaction->cache_write_mutex);
- mutex_lock(&trans->transaction->cache_write_mutex);
- }
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
- /*
- * go through delayed refs for all the stuff we've just kicked off
- * and then loop back (just once)
- */
- ret = btrfs_run_delayed_refs(trans, 0);
- if (!ret && loops == 0) {
- loops++;
- spin_lock(&cur_trans->dirty_bgs_lock);
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
- /*
- * dirty_bgs_lock protects us from concurrent block group
- * deletes too (not just cache_write_mutex).
- */
- if (!list_empty(&dirty)) {
- spin_unlock(&cur_trans->dirty_bgs_lock);
- goto again;
- }
- spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
- btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
- }
-
- btrfs_free_path(path);
- return ret;
-}
-
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_group_cache *cache;
- struct btrfs_transaction *cur_trans = trans->transaction;
- int ret = 0;
- int should_put;
- struct btrfs_path *path;
- struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /*
- * Even though we are in the critical section of the transaction commit,
- * we can still have concurrent tasks adding elements to this
- * transaction's list of dirty block groups. These tasks correspond to
- * endio free space workers started when writeback finishes for a
- * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
- * allocate new block groups as a result of COWing nodes of the root
- * tree when updating the free space inode. The writeback for the space
- * caches is triggered by an earlier call to
- * btrfs_start_dirty_block_groups() and iterations of the following
- * loop.
- * Also we want to do the cache_save_setup first and then run the
- * delayed refs to make sure we have the best chance at doing this all
- * in one shot.
- */
- spin_lock(&cur_trans->dirty_bgs_lock);
- while (!list_empty(&cur_trans->dirty_bgs)) {
- cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
- dirty_list);
-
- /*
- * this can happen if cache_save_setup re-dirties a block
- * group that is already under IO. Just wait for it to
- * finish and then do it all again
- */
- if (!list_empty(&cache->io_list)) {
- spin_unlock(&cur_trans->dirty_bgs_lock);
- list_del_init(&cache->io_list);
- btrfs_wait_cache_io(trans, cache, path);
- btrfs_put_block_group(cache);
- spin_lock(&cur_trans->dirty_bgs_lock);
- }
-
- /*
- * don't remove from the dirty list until after we've waited
- * on any pending IO
- */
- list_del_init(&cache->dirty_list);
- spin_unlock(&cur_trans->dirty_bgs_lock);
- should_put = 1;
-
- cache_save_setup(cache, trans, path);
-
- if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
-
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
- cache->io_ctl.inode = NULL;
- ret = btrfs_write_out_cache(fs_info, trans,
- cache, path);
- if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
- should_put = 0;
- list_add_tail(&cache->io_list, io);
- } else {
- /*
- * if we failed to write the cache, the
- * generation will be bad and life goes on
- */
- ret = 0;
- }
- }
- if (!ret) {
- ret = write_one_cache_group(trans, fs_info,
- path, cache);
- /*
- * One of the free space endio workers might have
- * created a new block group while updating a free space
- * cache's inode (at inode.c:btrfs_finish_ordered_io())
- * and hasn't released its transaction handle yet, in
- * which case the new block group is still attached to
- * its transaction handle and its creation has not
- * finished yet (no block group item in the extent tree
- * yet, etc). If this is the case, wait for all free
- * space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
- * complex approach.
- */
- if (ret == -ENOENT) {
- wait_event(cur_trans->writer_wait,
- atomic_read(&cur_trans->num_writers) == 1);
- ret = write_one_cache_group(trans, fs_info,
- path, cache);
- }
- if (ret)
- btrfs_abort_transaction(trans, ret);
- }
-
- /* if its not on the io list, we need to put the block group */
- if (should_put)
- btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
- spin_lock(&cur_trans->dirty_bgs_lock);
- }
- spin_unlock(&cur_trans->dirty_bgs_lock);
-
- /*
- * Refer to the definition of io_bgs member for details why it's safe
- * to use it without any locking
- */
- while (!list_empty(io)) {
- cache = list_first_entry(io, struct btrfs_block_group_cache,
- io_list);
- list_del_init(&cache->io_list);
- btrfs_wait_cache_io(trans, cache, path);
- btrfs_put_block_group(cache);
- }
-
- btrfs_free_path(path);
- return ret;
-}
-
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int readonly = 0;
block_group = btrfs_lookup_block_group(fs_info, bytenr);
@@ -3886,254 +2548,6 @@ int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
return readonly;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_block_group_cache *bg;
- bool ret = true;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- if (!bg)
- return false;
-
- spin_lock(&bg->lock);
- if (bg->ro)
- ret = false;
- else
- atomic_inc(&bg->nocow_writers);
- spin_unlock(&bg->lock);
-
- /* no put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
- btrfs_put_block_group(bg);
-
- return ret;
-
-}
-
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_block_group_cache *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
- if (atomic_dec_and_test(&bg->nocow_writers))
- wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
- btrfs_put_block_group(bg);
-}
-
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
-{
- wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
-}
-
-static const char *alloc_name(u64 flags)
-{
- switch (flags) {
- case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
- return "mixed";
- case BTRFS_BLOCK_GROUP_METADATA:
- return "metadata";
- case BTRFS_BLOCK_GROUP_DATA:
- return "data";
- case BTRFS_BLOCK_GROUP_SYSTEM:
- return "system";
- default:
- WARN_ON(1);
- return "invalid-combination";
- };
-}
-
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
-{
-
- struct btrfs_space_info *space_info;
- int i;
- int ret;
-
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
- if (!space_info)
- return -ENOMEM;
-
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
- GFP_KERNEL);
- if (ret) {
- kfree(space_info);
- return ret;
- }
-
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- INIT_LIST_HEAD(&space_info->block_groups[i]);
- init_rwsem(&space_info->groups_sem);
- spin_lock_init(&space_info->lock);
- space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
- init_waitqueue_head(&space_info->wait);
- INIT_LIST_HEAD(&space_info->ro_bgs);
- INIT_LIST_HEAD(&space_info->tickets);
- INIT_LIST_HEAD(&space_info->priority_tickets);
-
- ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
- info->space_info_kobj, "%s",
- alloc_name(space_info->flags));
- if (ret) {
- percpu_counter_destroy(&space_info->total_bytes_pinned);
- kfree(space_info);
- return ret;
- }
-
- list_add_rcu(&space_info->list, &info->space_info);
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- info->data_sinfo = space_info;
-
- return ret;
-}
-
-static void update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
- struct btrfs_space_info **space_info)
-{
- struct btrfs_space_info *found;
- int factor;
-
- factor = btrfs_bg_type_to_factor(flags);
-
- found = __find_space_info(info, flags);
- ASSERT(found);
- spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- if (total_bytes > 0)
- found->full = 0;
- space_info_add_new_bytes(info, found, total_bytes -
- bytes_used - bytes_readonly);
- spin_unlock(&found->lock);
- *space_info = found;
-}
-
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 extra_flags = chunk_to_extended(flags) &
- BTRFS_EXTENDED_PROFILE_MASK;
-
- write_seqlock(&fs_info->profiles_lock);
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
- write_sequnlock(&fs_info->profiles_lock);
-}
-
-/*
- * returns target flags in extended format or 0 if restripe for this
- * chunk_type is not in progress
- *
- * should be called with balance_lock held
- */
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
-{
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
- u64 target = 0;
-
- if (!bctl)
- return 0;
-
- if (flags & BTRFS_BLOCK_GROUP_DATA &&
- bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
- } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
- bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
- } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
- bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
- }
-
- return target;
-}
-
-/*
- * @flags: available profiles in extended format (see ctree.h)
- *
- * Returns reduced profile in chunk format. If profile changing is in
- * progress (either running or paused) picks the target profile (if it's
- * already available), otherwise falls back to plain reducing.
- */
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices = fs_info->fs_devices->rw_devices;
- u64 target;
- u64 raid_type;
- u64 allowed = 0;
-
- /*
- * see if restripe for this chunk_type is in progress, if so
- * try to reduce to the target profile
- */
- spin_lock(&fs_info->balance_lock);
- target = get_restripe_target(fs_info, flags);
- if (target) {
- /* pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
- }
- spin_unlock(&fs_info->balance_lock);
-
- /* First, mask out the RAID levels which aren't possible */
- for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
- if (num_devices >= btrfs_raid_array[raid_type].devs_min)
- allowed |= btrfs_raid_array[raid_type].bg_flag;
- }
- allowed &= flags;
-
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
- allowed = BTRFS_BLOCK_GROUP_RAID6;
- else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
- allowed = BTRFS_BLOCK_GROUP_RAID5;
- else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
- allowed = BTRFS_BLOCK_GROUP_RAID10;
- else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
- allowed = BTRFS_BLOCK_GROUP_RAID1;
- else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
- allowed = BTRFS_BLOCK_GROUP_RAID0;
-
- flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
- return extended_to_chunk(flags | allowed);
-}
-
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
- unsigned seq;
- u64 flags;
-
- do {
- flags = orig_flags;
- seq = read_seqbegin(&fs_info->profiles_lock);
-
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= fs_info->avail_data_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= fs_info->avail_system_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= fs_info->avail_metadata_alloc_bits;
- } while (read_seqretry(&fs_info->profiles_lock, seq));
-
- return btrfs_reduce_alloc_profile(fs_info, flags);
-}
-
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4147,2331 +2561,13 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = get_alloc_profile(fs_info, flags);
- return ret;
-}
-
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
-{
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
-}
-
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
-{
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-}
-
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
-{
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-}
-
-static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
- bool may_use_included)
-{
- ASSERT(s_info);
- return s_info->bytes_used + s_info->bytes_reserved +
- s_info->bytes_pinned + s_info->bytes_readonly +
- (may_use_included ? s_info->bytes_may_use : 0);
-}
-
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
-
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, fs_info->sectorsize);
-
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * if we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = do_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- update_bytes_may_use(data_sinfo, bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- data_sinfo->flags, bytes, 1);
- spin_unlock(&data_sinfo->lock);
-
- return 0;
-}
-
-int btrfs_check_data_free_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
-
- /* align the range */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
-
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
- if (ret < 0)
- return ret;
-
- /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
- if (ret < 0)
- btrfs_free_reserved_data_space_noquota(inode, start, len);
- else
- ret = 0;
- return ret;
-}
-
-/*
- * Called if we need to clear a data reservation for this inode
- * Normally in a error case.
- *
- * This one will *NOT* use accurate qgroup reserved space API, just for case
- * which we can't sleep and is sure it won't affect qgroup reserved space.
- * Like clear_bit_hook().
- */
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
- u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_space_info *data_sinfo;
-
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
-
- data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- update_bytes_may_use(data_sinfo, -len);
- trace_btrfs_space_reservation(fs_info, "space_info",
- data_sinfo->flags, len, 0);
- spin_unlock(&data_sinfo->lock);
-}
-
-/*
- * Called if we need to clear a data reservation for this inode
- * Normally in a error case.
- *
- * This one will handle the per-inode data rsv map for accurate reserved
- * space framework.
- */
-void btrfs_free_reserved_data_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->fs_info->sectorsize) -
- round_down(start, root->fs_info->sectorsize);
- start = round_down(start, root->fs_info->sectorsize);
-
- btrfs_free_reserved_data_space_noquota(inode, start, len);
- btrfs_qgroup_free_data(inode, reserved, start, len);
-}
-
-static void force_metadata_allocation(struct btrfs_fs_info *info)
-{
- struct list_head *head = &info->space_info;
- struct btrfs_space_info *found;
-
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- found->force_alloc = CHUNK_ALLOC_FORCE;
- }
- rcu_read_unlock();
-}
-
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *sinfo, int force)
-{
- u64 bytes_used = btrfs_space_info_used(sinfo, false);
- u64 thresh;
-
- if (force == CHUNK_ALLOC_FORCE)
- return 1;
-
- /*
- * in limited mode, we want to have some free space up to
- * about 1% of the FS size.
- */
- if (force == CHUNK_ALLOC_LIMITED) {
- thresh = btrfs_super_total_bytes(fs_info->super_copy);
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
-
- if (sinfo->total_bytes - bytes_used < thresh)
- return 1;
- }
-
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
- return 0;
- return 1;
-}
-
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
-{
- u64 num_dev;
-
- if (type & (BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6))
- num_dev = fs_info->fs_devices->rw_devices;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
- num_dev = 2;
- else
- num_dev = 1; /* DUP or single */
-
- return num_dev;
-}
-
-/*
- * If @is_allocation is true, reserve space in the system space info necessary
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
- * removing a chunk.
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *info;
- u64 left;
- u64 thresh;
- int ret = 0;
- u64 num_devs;
-
- /*
- * Needed because we can end up allocating a system chunk and for an
- * atomic and race free space reservation in the chunk block reserve.
- */
- lockdep_assert_held(&fs_info->chunk_mutex);
-
- info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
- spin_lock(&info->lock);
- left = info->total_bytes - btrfs_space_info_used(info, true);
- spin_unlock(&info->lock);
-
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
- btrfs_calc_trans_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
- dump_space_info(fs_info, info, 0, 0);
- }
-
- if (left < thresh) {
- u64 flags = btrfs_system_alloc_profile(fs_info);
-
- /*
- * Ignore failure to create system chunk. We might end up not
- * needing it, as we might not need to COW all nodes/leafs from
- * the paths we visit in the chunk tree (they were already COWed
- * or created in the current transaction for example).
- */
- ret = btrfs_alloc_chunk(trans, flags);
- }
-
- if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
- &fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
- trans->chunk_bytes_reserved += thresh;
- }
-}
-
-/*
- * If force is CHUNK_ALLOC_FORCE:
- * - return 1 if it successfully allocates a chunk,
- * - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
- * - return 0 if it doesn't need to allocate a new chunk,
- * - return 1 if it successfully allocates a chunk,
- * - return errors including -ENOSPC otherwise.
- */
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
- int force)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *space_info;
- bool wait_for_alloc = false;
- bool should_alloc = false;
- int ret = 0;
-
- /* Don't re-enter if we're already allocating a chunk */
- if (trans->allocating_chunk)
- return -ENOSPC;
-
- space_info = __find_space_info(fs_info, flags);
- ASSERT(space_info);
-
- do {
- spin_lock(&space_info->lock);
- if (force < space_info->force_alloc)
- force = space_info->force_alloc;
- should_alloc = should_alloc_chunk(fs_info, space_info, force);
- if (space_info->full) {
- /* No more free physical space */
- if (should_alloc)
- ret = -ENOSPC;
- else
- ret = 0;
- spin_unlock(&space_info->lock);
- return ret;
- } else if (!should_alloc) {
- spin_unlock(&space_info->lock);
- return 0;
- } else if (space_info->chunk_alloc) {
- /*
- * Someone is already allocating, so we need to block
- * until this someone is finished and then loop to
- * recheck if we should continue with our allocation
- * attempt.
- */
- wait_for_alloc = true;
- spin_unlock(&space_info->lock);
- mutex_lock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->chunk_mutex);
- } else {
- /* Proceed with allocation */
- space_info->chunk_alloc = 1;
- wait_for_alloc = false;
- spin_unlock(&space_info->lock);
- }
-
- cond_resched();
- } while (wait_for_alloc);
-
- mutex_lock(&fs_info->chunk_mutex);
- trans->allocating_chunk = true;
-
- /*
- * If we have mixed data/metadata chunks we want to make sure we keep
- * allocating mixed chunks instead of individual chunks.
- */
- if (btrfs_mixed_space_info(space_info))
- flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
-
- /*
- * if we're doing a data chunk, go ahead and make sure that
- * we keep a reasonable number of metadata chunks allocated in the
- * FS as well.
- */
- if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
- fs_info->data_chunk_allocations++;
- if (!(fs_info->data_chunk_allocations %
- fs_info->metadata_ratio))
- force_metadata_allocation(fs_info);
- }
-
- /*
- * Check if we have enough space in SYSTEM chunk because we may need
- * to update devices.
- */
- check_system_chunk(trans, flags);
-
- ret = btrfs_alloc_chunk(trans, flags);
- trans->allocating_chunk = false;
-
- spin_lock(&space_info->lock);
- if (ret < 0) {
- if (ret == -ENOSPC)
- space_info->full = 1;
- else
- goto out;
- } else {
- ret = 1;
- space_info->max_extent_size = 0;
- }
-
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
-out:
- space_info->chunk_alloc = 0;
- spin_unlock(&space_info->lock);
- mutex_unlock(&fs_info->chunk_mutex);
- /*
- * When we allocate a new chunk we reserve space in the chunk block
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
- * add new nodes/leafs to it if we end up needing to do it when
- * inserting the chunk item and updating device items as part of the
- * second phase of chunk allocation, performed by
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
- * large number of new block groups to create in our transaction
- * handle's new_bgs list to avoid exhausting the chunk block reserve
- * in extreme cases - like having a single transaction create many new
- * block groups when starting to write out the free space caches of all
- * the block groups that were made dirty during the lifetime of the
- * transaction.
- */
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
- btrfs_create_pending_block_groups(trans);
-
- return ret;
-}
-
-static int can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
-{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 profile;
- u64 space_size;
- u64 avail;
- u64 used;
- int factor;
-
- /* Don't overcommit when in mixed mode. */
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
-
- if (system_chunk)
- profile = btrfs_system_alloc_profile(fs_info);
- else
- profile = btrfs_metadata_alloc_profile(fs_info);
-
- used = btrfs_space_info_used(space_info, false);
-
- /*
- * We only want to allow over committing if we have lots of actual space
- * free, but if we don't have enough space to handle the global reserve
- * space then we could end up having a real enospc problem when trying
- * to allocate a chunk or some other such important allocation.
- */
- spin_lock(&global_rsv->lock);
- space_size = calc_global_rsv_need_space(global_rsv);
- spin_unlock(&global_rsv->lock);
- if (used + space_size >= space_info->total_bytes)
- return 0;
-
- used += space_info->bytes_may_use;
-
- avail = atomic64_read(&fs_info->free_chunk_space);
-
- /*
- * If we have dup, raid1 or raid10 then only half of the free
- * space is actually usable. For raid56, the space info used
- * doesn't include the parity drive, so we don't have to
- * change the math
- */
- factor = btrfs_bg_type_to_factor(profile);
- avail = div_u64(avail, factor);
-
- /*
- * If we aren't flushing all things, let us overcommit up to
- * 1/2th of the space. If we can flush, don't let us overcommit
- * too much, let it overcommit up to 1/8 of the space.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
- avail >>= 3;
- else
- avail >>= 1;
-
- if (used + bytes < space_info->total_bytes + avail)
- return 1;
- return 0;
-}
-
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- u64 bytes;
- u64 nr;
-
- bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
-#define EXTENT_SIZE_PER_ITEM SZ_256K
-
-/*
- * shrink metadata reservation for delalloc
- */
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
-{
- struct btrfs_space_info *space_info;
- struct btrfs_trans_handle *trans;
- u64 delalloc_bytes;
- u64 async_pages;
- u64 items;
- long time_left;
- unsigned long nr_pages;
- int loops;
-
- /* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- if (delalloc_bytes == 0) {
- if (trans)
- return;
- if (wait_ordered)
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- return;
- }
-
- loops = 0;
- while (delalloc_bytes && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
-
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
-
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
- spin_lock(&space_info->lock);
- if (list_empty(&space_info->tickets) &&
- list_empty(&space_info->priority_tickets)) {
- spin_unlock(&space_info->lock);
- break;
- }
- spin_unlock(&space_info->lock);
-
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- }
-}
-
-struct reserve_ticket {
- u64 orig_bytes;
- u64 bytes;
- int error;
- struct list_head list;
- wait_queue_head_t wait;
-};
-
-/**
- * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- struct reserve_ticket *ticket = NULL;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_trans_handle *trans;
- u64 bytes_needed;
- u64 reclaim_bytes = 0;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- if (trans)
- return -EAGAIN;
-
- spin_lock(&space_info->lock);
- if (!list_empty(&space_info->priority_tickets))
- ticket = list_first_entry(&space_info->priority_tickets,
- struct reserve_ticket, list);
- else if (!list_empty(&space_info->tickets))
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
- spin_unlock(&space_info->lock);
-
- if (!bytes_needed)
- return 0;
-
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * See if there is enough pinned space to make this reservation, or if
- * we have block groups that are going to be freed, allowing us to
- * possibly do a chunk allocation the next loop through.
- */
- if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
- __percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
- goto commit;
-
- /*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
- */
- if (space_info != delayed_rsv->space_info)
- goto enospc;
-
- spin_lock(&delayed_rsv->lock);
- reclaim_bytes += delayed_rsv->reserved;
- spin_unlock(&delayed_rsv->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- reclaim_bytes += delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- if (reclaim_bytes >= bytes_needed)
- goto commit;
- bytes_needed -= reclaim_bytes;
-
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
- goto enospc;
-
-commit:
- return btrfs_commit_transaction(trans);
-enospc:
- btrfs_end_transaction(trans);
- return -ENOSPC;
-}
-
-/*
- * Try to flush some data based on policy set by @state. This is only advisory
- * and may fail for various reasons. The caller is supposed to examine the
- * state of @space_info to detect the outcome.
- */
-static void flush_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 num_bytes,
- int state)
-{
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_trans_handle *trans;
- int nr;
- int ret = 0;
-
- switch (state) {
- case FLUSH_DELAYED_ITEMS_NR:
- case FLUSH_DELAYED_ITEMS:
- if (state == FLUSH_DELAYED_ITEMS_NR)
- nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
- else
- nr = -1;
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- ret = btrfs_run_delayed_items_nr(trans, nr);
- btrfs_end_transaction(trans);
- break;
- case FLUSH_DELALLOC:
- case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
- state == FLUSH_DELALLOC_WAIT);
- break;
- case FLUSH_DELAYED_REFS_NR:
- case FLUSH_DELAYED_REFS:
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_reclaim_items_nr(fs_info, num_bytes);
- else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
- btrfs_end_transaction(trans);
- break;
- case ALLOC_CHUNK:
- case ALLOC_CHUNK_FORCE:
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- ret = do_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
- (state == ALLOC_CHUNK) ?
- CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
- btrfs_end_transaction(trans);
- if (ret > 0 || ret == -ENOSPC)
- ret = 0;
- break;
- case COMMIT_TRANS:
- /*
- * If we have pending delayed iputs then we could free up a
- * bunch of pinned space, so make sure we run the iputs before
- * we do our pinned bytes check below.
- */
- btrfs_run_delayed_iputs(fs_info);
- btrfs_wait_on_delayed_iputs(fs_info);
-
- ret = may_commit_transaction(fs_info, space_info);
- break;
- default:
- ret = -ENOSPC;
- break;
- }
-
- trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
- ret);
- return;
-}
-
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool system_chunk)
-{
- struct reserve_ticket *ticket;
- u64 used;
- u64 expected;
- u64 to_reclaim = 0;
-
- list_for_each_entry(ticket, &space_info->tickets, list)
- to_reclaim += ticket->bytes;
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
- to_reclaim += ticket->bytes;
- if (to_reclaim)
- return to_reclaim;
-
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
- return 0;
-
- used = btrfs_space_info_used(space_info, true);
-
- if (can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
- expected = div_factor_fine(space_info->total_bytes, 95);
- else
- expected = div_factor_fine(space_info->total_bytes, 90);
-
- if (used > expected)
- to_reclaim = used - expected;
- else
- to_reclaim = 0;
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
- space_info->bytes_reserved);
- return to_reclaim;
-}
-
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 used, bool system_chunk)
-{
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
-
- /* If we're just plain full then async reclaim just slows us down. */
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
- return 0;
-
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- system_chunk))
- return 0;
-
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
-}
-
-static bool wake_all_tickets(struct list_head *head)
-{
- struct reserve_ticket *ticket;
-
- while (!list_empty(head)) {
- ticket = list_first_entry(head, struct reserve_ticket, list);
- list_del_init(&ticket->list);
- ticket->error = -ENOSPC;
- wake_up(&ticket->wait);
- if (ticket->bytes != ticket->orig_bytes)
- return true;
- }
- return false;
-}
-
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to. We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
-{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
- u64 to_reclaim;
- int flush_state;
- int commit_cycles = 0;
- u64 last_tickets_id;
-
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
- spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
- if (!to_reclaim) {
- space_info->flush = 0;
- spin_unlock(&space_info->lock);
- return;
- }
- last_tickets_id = space_info->tickets_id;
- spin_unlock(&space_info->lock);
-
- flush_state = FLUSH_DELAYED_ITEMS_NR;
- do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
- spin_lock(&space_info->lock);
- if (list_empty(&space_info->tickets)) {
- space_info->flush = 0;
- spin_unlock(&space_info->lock);
- return;
- }
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
- space_info,
- false);
- if (last_tickets_id == space_info->tickets_id) {
- flush_state++;
- } else {
- last_tickets_id = space_info->tickets_id;
- flush_state = FLUSH_DELAYED_ITEMS_NR;
- if (commit_cycles)
- commit_cycles--;
- }
-
- /*
- * We don't want to force a chunk allocation until we've tried
- * pretty hard to reclaim space. Think of the case where we
- * freed up a bunch of space and so have a lot of pinned space
- * to reclaim. We would rather use that than possibly create a
- * underutilized metadata chunk. So if this is our first run
- * through the flushing state machine skip ALLOC_CHUNK_FORCE and
- * commit the transaction. If nothing has changed the next go
- * around then we can force a chunk allocation.
- */
- if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
- flush_state++;
-
- if (flush_state > COMMIT_TRANS) {
- commit_cycles++;
- if (commit_cycles > 2) {
- if (wake_all_tickets(&space_info->tickets)) {
- flush_state = FLUSH_DELAYED_ITEMS_NR;
- commit_cycles--;
- } else {
- space_info->flush = 0;
- }
- } else {
- flush_state = FLUSH_DELAYED_ITEMS_NR;
- }
- }
- spin_unlock(&space_info->lock);
- } while (flush_state <= COMMIT_TRANS);
-}
-
-void btrfs_init_async_reclaim_work(struct work_struct *work)
-{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
-}
-
-static const enum btrfs_flush_state priority_flush_states[] = {
- FLUSH_DELAYED_ITEMS_NR,
- FLUSH_DELAYED_ITEMS,
- ALLOC_CHUNK,
-};
-
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket)
-{
- u64 to_reclaim;
- int flush_state;
-
- spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
- if (!to_reclaim) {
- spin_unlock(&space_info->lock);
- return;
- }
- spin_unlock(&space_info->lock);
-
- flush_state = 0;
- do {
- flush_space(fs_info, space_info, to_reclaim,
- priority_flush_states[flush_state]);
- flush_state++;
- spin_lock(&space_info->lock);
- if (ticket->bytes == 0) {
- spin_unlock(&space_info->lock);
- return;
- }
- spin_unlock(&space_info->lock);
- } while (flush_state < ARRAY_SIZE(priority_flush_states));
-}
-
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket)
-
-{
- DEFINE_WAIT(wait);
- u64 reclaim_bytes = 0;
- int ret = 0;
-
- spin_lock(&space_info->lock);
- while (ticket->bytes > 0 && ticket->error == 0) {
- ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
- if (ret) {
- ret = -EINTR;
- break;
- }
- spin_unlock(&space_info->lock);
-
- schedule();
-
- finish_wait(&ticket->wait, &wait);
- spin_lock(&space_info->lock);
- }
- if (!ret)
- ret = ticket->error;
- if (!list_empty(&ticket->list))
- list_del_init(&ticket->list);
- if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
- reclaim_bytes = ticket->orig_bytes - ticket->bytes;
- spin_unlock(&space_info->lock);
-
- if (reclaim_bytes)
- space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
- return ret;
-}
-
-/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the space info associated
- * with the block_rsv. If there is not enough space it will make an attempt to
- * flush out space to make room. It will do this by flushing delalloc if
- * possible or committing the transaction. If flush is 0 then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
-{
- struct reserve_ticket ticket;
- u64 used;
- u64 reclaim_bytes = 0;
- int ret = 0;
-
- ASSERT(orig_bytes);
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
-
- spin_lock(&space_info->lock);
- ret = -ENOSPC;
- used = btrfs_space_info_used(space_info, true);
-
- /*
- * If we have enough space then hooray, make our reservation and carry
- * on. If not see if we can overcommit, and if we can, hooray carry on.
- * If not things get more complicated.
- */
- if (used + orig_bytes <= space_info->total_bytes) {
- update_bytes_may_use(space_info, orig_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, orig_bytes, 1);
- ret = 0;
- } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
- system_chunk)) {
- update_bytes_may_use(space_info, orig_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, orig_bytes, 1);
- ret = 0;
- }
-
- /*
- * If we couldn't make a reservation then setup our reservation ticket
- * and kick the async worker if it's not already running.
- *
- * If we are a priority flusher then we just need to add our ticket to
- * the list and we will do our own flushing further down.
- */
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
- ticket.orig_bytes = orig_bytes;
- ticket.bytes = orig_bytes;
- ticket.error = 0;
- init_waitqueue_head(&ticket.wait);
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
- list_add_tail(&ticket.list, &space_info->tickets);
- if (!space_info->flush) {
- space_info->flush = 1;
- trace_btrfs_trigger_flush(fs_info,
- space_info->flags,
- orig_bytes, flush,
- "enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
- }
- } else {
- list_add_tail(&ticket.list,
- &space_info->priority_tickets);
- }
- } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
- /*
- * We will do the space reservation dance during log replay,
- * which means we won't have fs_info->fs_root set, so don't do
- * the async reclaim as we will panic.
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info,
- used, system_chunk) &&
- !work_busy(&fs_info->async_reclaim_work)) {
- trace_btrfs_trigger_flush(fs_info, space_info->flags,
- orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
- }
- }
- spin_unlock(&space_info->lock);
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
- return ret;
-
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(fs_info, space_info, &ticket);
-
- ret = 0;
- priority_reclaim_metadata_space(fs_info, space_info, &ticket);
- spin_lock(&space_info->lock);
- if (ticket.bytes) {
- if (ticket.bytes < orig_bytes)
- reclaim_bytes = orig_bytes - ticket.bytes;
- list_del_init(&ticket.list);
- ret = -ENOSPC;
- }
- spin_unlock(&space_info->lock);
-
- if (reclaim_bytes)
- space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
- ASSERT(list_empty(&ticket.list));
- return ret;
-}
-
-/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
- *
- * This will reserve orig_bytes number of bytes from the space info associated
- * with the block_rsv. If there is not enough space it will make an attempt to
- * flush out space to make room. It will do this by flushing delalloc if
- * possible or committing the transaction. If flush is 0 then no attempts to
- * regain reservations will be made and this will fail if there is not enough
- * space already.
- */
-static int reserve_metadata_bytes(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- int ret;
- bool system_chunk = (root == fs_info->chunk_root);
-
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush, system_chunk);
- if (ret == -ENOSPC &&
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- if (block_rsv != global_rsv &&
- !block_rsv_use_bytes(global_rsv, orig_bytes))
- ret = 0;
- }
- if (ret == -ENOSPC) {
- trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
- }
- return ret;
-}
-
-static struct btrfs_block_rsv *get_block_rsv(
- const struct btrfs_trans_handle *trans,
- const struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *block_rsv = NULL;
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- (root == fs_info->csum_root && trans->adding_csums) ||
- (root == fs_info->uuid_root))
- block_rsv = trans->block_rsv;
-
- if (!block_rsv)
- block_rsv = root->block_rsv;
-
- if (!block_rsv)
- block_rsv = &fs_info->empty_block_rsv;
-
- return block_rsv;
-}
-
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- int ret = -ENOSPC;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved >= num_bytes) {
- block_rsv->reserved -= num_bytes;
- if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
- ret = 0;
- }
- spin_unlock(&block_rsv->lock);
- return ret;
-}
-
-static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, bool update_size)
-{
- spin_lock(&block_rsv->lock);
- block_rsv->reserved += num_bytes;
- if (update_size)
- block_rsv->size += num_bytes;
- else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
- spin_unlock(&block_rsv->lock);
-}
-
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor)
-{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 min_bytes;
-
- if (global_rsv->space_info != dest->space_info)
- return -ENOSPC;
-
- spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, min_factor);
- if (global_rsv->reserved < min_bytes + num_bytes) {
- spin_unlock(&global_rsv->lock);
- return -ENOSPC;
- }
- global_rsv->reserved -= num_bytes;
- if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
- spin_unlock(&global_rsv->lock);
-
- block_rsv_add_bytes(dest, num_bytes, true);
- return 0;
-}
-
-/**
- * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
- * @fs_info - the fs info for our fs.
- * @src - the source block rsv to transfer from.
- * @num_bytes - the number of bytes to transfer.
- *
- * This transfers up to the num_bytes amount from the src rsv to the
- * delayed_refs_rsv. Any extra bytes are returned to the space info.
- */
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
- u64 num_bytes)
-{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- u64 to_free = 0;
-
- spin_lock(&src->lock);
- src->reserved -= num_bytes;
- src->size -= num_bytes;
- spin_unlock(&src->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
- u64 delta = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- if (num_bytes > delta) {
- to_free = num_bytes - delta;
- num_bytes = delta;
- }
- } else {
- to_free = num_bytes;
- num_bytes = 0;
- }
-
- if (num_bytes)
- delayed_refs_rsv->reserved += num_bytes;
- if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = 1;
- spin_unlock(&delayed_refs_rsv->lock);
-
- if (num_bytes)
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
- if (to_free)
- space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
- to_free);
-}
-
-/**
- * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
- * @fs_info - the fs_info for our fs.
- * @flush - control how we can flush for this reservation.
- *
- * This will refill the delayed block_rsv up to 1 items size worth of space and
- * will return -ENOSPC if we can't make the reservation.
- */
-int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
- enum btrfs_reserve_flush_enum flush)
-{
- struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
- u64 num_bytes = 0;
- int ret = -ENOSPC;
-
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size) {
- num_bytes = block_rsv->size - block_rsv->reserved;
- num_bytes = min(num_bytes, limit);
- }
- spin_unlock(&block_rsv->lock);
-
- if (!num_bytes)
- return 0;
-
- ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
- num_bytes, flush);
- if (ret)
- return ret;
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
- return 0;
-}
-
-/*
- * This is for space we already have accounted in space_info->bytes_may_use, so
- * basically when we're returning space from block_rsv's.
- */
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 num_bytes)
-{
- struct reserve_ticket *ticket;
- struct list_head *head;
- u64 used;
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
- bool check_overcommit = false;
-
- spin_lock(&space_info->lock);
- head = &space_info->priority_tickets;
-
- /*
- * If we are over our limit then we need to check and see if we can
- * overcommit, and if we can't then we just need to free up our space
- * and not satisfy any requests.
- */
- used = btrfs_space_info_used(space_info, true);
- if (used - num_bytes >= space_info->total_bytes)
- check_overcommit = true;
-again:
- while (!list_empty(head) && num_bytes) {
- ticket = list_first_entry(head, struct reserve_ticket,
- list);
- /*
- * We use 0 bytes because this space is already reserved, so
- * adding the ticket space would be a double count.
- */
- if (check_overcommit &&
- !can_overcommit(fs_info, space_info, 0, flush, false))
- break;
- if (num_bytes >= ticket->bytes) {
- list_del_init(&ticket->list);
- num_bytes -= ticket->bytes;
- ticket->bytes = 0;
- space_info->tickets_id++;
- wake_up(&ticket->wait);
- } else {
- ticket->bytes -= num_bytes;
- num_bytes = 0;
- }
- }
-
- if (num_bytes && head == &space_info->priority_tickets) {
- head = &space_info->tickets;
- flush = BTRFS_RESERVE_FLUSH_ALL;
- goto again;
- }
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- spin_unlock(&space_info->lock);
-}
-
-/*
- * This is for newly allocated space that isn't accounted in
- * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
- * we use this helper.
- */
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 num_bytes)
-{
- struct reserve_ticket *ticket;
- struct list_head *head = &space_info->priority_tickets;
-
-again:
- while (!list_empty(head) && num_bytes) {
- ticket = list_first_entry(head, struct reserve_ticket,
- list);
- if (num_bytes >= ticket->bytes) {
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- ticket->bytes, 1);
- list_del_init(&ticket->list);
- num_bytes -= ticket->bytes;
- update_bytes_may_use(space_info, ticket->bytes);
- ticket->bytes = 0;
- space_info->tickets_id++;
- wake_up(&ticket->wait);
- } else {
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- num_bytes, 1);
- update_bytes_may_use(space_info, num_bytes);
- ticket->bytes -= num_bytes;
- num_bytes = 0;
- }
- }
-
- if (num_bytes && head == &space_info->priority_tickets) {
- head = &space_info->tickets;
- goto again;
- }
-}
-
-static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- u64 *qgroup_to_release_ret)
-{
- struct btrfs_space_info *space_info = block_rsv->space_info;
- u64 qgroup_to_release = 0;
- u64 ret;
-
- spin_lock(&block_rsv->lock);
- if (num_bytes == (u64)-1) {
- num_bytes = block_rsv->size;
- qgroup_to_release = block_rsv->qgroup_rsv_size;
- }
- block_rsv->size -= num_bytes;
- if (block_rsv->reserved >= block_rsv->size) {
- num_bytes = block_rsv->reserved - block_rsv->size;
- block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
- } else {
- num_bytes = 0;
- }
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
- qgroup_to_release = block_rsv->qgroup_rsv_reserved -
- block_rsv->qgroup_rsv_size;
- block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
- } else {
- qgroup_to_release = 0;
- }
- spin_unlock(&block_rsv->lock);
-
- ret = num_bytes;
- if (num_bytes > 0) {
- if (dest) {
- spin_lock(&dest->lock);
- if (!dest->full) {
- u64 bytes_to_add;
-
- bytes_to_add = dest->size - dest->reserved;
- bytes_to_add = min(num_bytes, bytes_to_add);
- dest->reserved += bytes_to_add;
- if (dest->reserved >= dest->size)
- dest->full = 1;
- num_bytes -= bytes_to_add;
- }
- spin_unlock(&dest->lock);
- }
- if (num_bytes)
- space_info_add_old_bytes(fs_info, space_info,
- num_bytes);
- }
- if (qgroup_to_release_ret)
- *qgroup_to_release_ret = qgroup_to_release;
- return ret;
-}
-
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
- struct btrfs_block_rsv *dst, u64 num_bytes,
- bool update_size)
-{
- int ret;
-
- ret = block_rsv_use_bytes(src, num_bytes);
- if (ret)
- return ret;
-
- block_rsv_add_bytes(dst, num_bytes, update_size);
- return 0;
-}
-
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
-{
- memset(rsv, 0, sizeof(*rsv));
- spin_lock_init(&rsv->lock);
- rsv->type = type;
-}
-
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv,
- unsigned short type)
-{
- btrfs_init_block_rsv(rsv, type);
- rsv->space_info = __find_space_info(fs_info,
- BTRFS_BLOCK_GROUP_METADATA);
-}
-
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
-{
- struct btrfs_block_rsv *block_rsv;
-
- block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
- if (!block_rsv)
- return NULL;
-
- btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
- return block_rsv;
-}
-
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv)
-{
- if (!rsv)
- return;
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
- kfree(rsv);
-}
-
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, u64 num_bytes,
- enum btrfs_reserve_flush_enum flush)
-{
- int ret;
-
- if (num_bytes == 0)
- return 0;
-
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret)
- block_rsv_add_bytes(block_rsv, num_bytes, true);
-
- return ret;
-}
-
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
-{
- u64 num_bytes = 0;
- int ret = -ENOSPC;
-
- if (!block_rsv)
- return 0;
-
- spin_lock(&block_rsv->lock);
- num_bytes = div_factor(block_rsv->size, min_factor);
- if (block_rsv->reserved >= num_bytes)
- ret = 0;
- spin_unlock(&block_rsv->lock);
-
- return ret;
-}
-
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
- enum btrfs_reserve_flush_enum flush)
-{
- u64 num_bytes = 0;
- int ret = -ENOSPC;
-
- if (!block_rsv)
- return 0;
-
- spin_lock(&block_rsv->lock);
- num_bytes = min_reserved;
- if (block_rsv->reserved >= num_bytes)
- ret = 0;
- else
- num_bytes -= block_rsv->reserved;
- spin_unlock(&block_rsv->lock);
-
- if (!ret)
- return 0;
-
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, false);
- return 0;
- }
-
- return ret;
-}
-
-static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
- u64 *metadata_bytes, u64 *qgroup_bytes)
-{
- *metadata_bytes = 0;
- *qgroup_bytes = 0;
-
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size)
- *metadata_bytes = block_rsv->size - block_rsv->reserved;
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
- *qgroup_bytes = block_rsv->qgroup_rsv_size -
- block_rsv->qgroup_rsv_reserved;
- spin_unlock(&block_rsv->lock);
-}
-
-/**
- * btrfs_inode_rsv_refill - refill the inode block rsv.
- * @inode - the inode we are refilling.
- * @flush - the flushing restriction.
- *
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
- * block_rsv->size as the minimum size. We'll either refill the missing amount
- * or return if we already have enough space. This will also handle the reserve
- * tracepoint for the reserved amount.
- */
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
- enum btrfs_reserve_flush_enum flush)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 num_bytes, last = 0;
- u64 qgroup_num_bytes;
- int ret = -ENOSPC;
-
- calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
- if (num_bytes == 0)
- return 0;
-
- do {
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
- true);
- if (ret)
- return ret;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (ret) {
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
- last = num_bytes;
- /*
- * If we are fragmented we can end up with a lot of
- * outstanding extents which will make our size be much
- * larger than our reserved amount.
- *
- * If the reservation happens here, it might be very
- * big though not needed in the end, if the delalloc
- * flushing happens.
- *
- * If this is the case try and do the reserve again.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
- calc_refill_bytes(block_rsv, &num_bytes,
- &qgroup_num_bytes);
- if (num_bytes == 0)
- return 0;
- }
- } while (ret && last != num_bytes);
-
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, false);
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
- btrfs_ino(inode), num_bytes, 1);
-
- /* Don't forget to increase qgroup_rsv_reserved */
- spin_lock(&block_rsv->lock);
- block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
- spin_unlock(&block_rsv->lock);
- }
- return ret;
-}
-
-static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, u64 *qgroup_to_release)
-{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *target = delayed_rsv;
-
- if (target->full || target == block_rsv)
- target = global_rsv;
-
- if (block_rsv->space_info != target->space_info)
- target = NULL;
-
- return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
- qgroup_to_release);
-}
-
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
-}
-
-/**
- * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
- * Unlike normal operation, qgroup meta reservation needs to know if we are
- * freeing qgroup reservation or just converting it into per-trans. Normally
- * @qgroup_free is true for error handling, and false for normal release.
- *
- * This is the same as btrfs_block_rsv_release, except that it handles the
- * tracepoint for the reservation.
- */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 released = 0;
- u64 qgroup_to_release = 0;
-
- /*
- * Since we statically set the block_rsv->size we just want to say we
- * are releasing 0 bytes, and then we'll just get the reservation over
- * the size free'd.
- */
- released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
- &qgroup_to_release);
- if (released > 0)
- trace_btrfs_space_reservation(fs_info, "delalloc",
- btrfs_ino(inode), released, 0);
- if (qgroup_free)
- btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
- else
- btrfs_qgroup_convert_reserved_meta(inode->root,
- qgroup_to_release);
-}
-
-/**
- * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
- * @fs_info - the fs_info for our fs.
- * @nr - the number of items to drop.
- *
- * This drops the delayed ref head's count from the delayed refs rsv and frees
- * any excess reservation we had.
- */
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
-{
- struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
- u64 released = 0;
-
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
- num_bytes, NULL);
- if (released)
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, released, 0);
-}
-
-static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
- struct btrfs_space_info *sinfo = block_rsv->space_info;
- u64 num_bytes;
-
- /*
- * The global block rsv is based on the size of the extent tree, the
- * checksum tree and the root tree. If the fs is empty we want to set
- * it to a minimal amount for safety.
- */
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
- btrfs_root_used(&fs_info->csum_root->root_item) +
- btrfs_root_used(&fs_info->tree_root->root_item);
- num_bytes = max_t(u64, num_bytes, SZ_16M);
-
- spin_lock(&sinfo->lock);
- spin_lock(&block_rsv->lock);
-
- block_rsv->size = min_t(u64, num_bytes, SZ_512M);
-
- if (block_rsv->reserved < block_rsv->size) {
- num_bytes = btrfs_space_info_used(sinfo, true);
- if (sinfo->total_bytes > num_bytes) {
- num_bytes = sinfo->total_bytes - num_bytes;
- num_bytes = min(num_bytes,
- block_rsv->size - block_rsv->reserved);
- block_rsv->reserved += num_bytes;
- update_bytes_may_use(sinfo, num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- sinfo->flags, num_bytes,
- 1);
- }
- } else if (block_rsv->reserved > block_rsv->size) {
- num_bytes = block_rsv->reserved - block_rsv->size;
- update_bytes_may_use(sinfo, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- sinfo->flags, num_bytes, 0);
- block_rsv->reserved = block_rsv->size;
- }
-
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
-
- spin_unlock(&block_rsv->lock);
- spin_unlock(&sinfo->lock);
-}
-
-static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_space_info *space_info;
-
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
- fs_info->chunk_block_rsv.space_info = space_info;
-
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
- fs_info->global_block_rsv.space_info = space_info;
- fs_info->trans_block_rsv.space_info = space_info;
- fs_info->empty_block_rsv.space_info = space_info;
- fs_info->delayed_block_rsv.space_info = space_info;
- fs_info->delayed_refs_rsv.space_info = space_info;
-
- fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
- if (fs_info->quota_root)
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
- update_global_block_rsv(fs_info);
-}
-
-static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
-{
- block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
- (u64)-1, NULL);
- WARN_ON(fs_info->trans_block_rsv.size > 0);
- WARN_ON(fs_info->trans_block_rsv.reserved > 0);
- WARN_ON(fs_info->chunk_block_rsv.size > 0);
- WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
- WARN_ON(fs_info->delayed_block_rsv.size > 0);
- WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
- WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
- WARN_ON(fs_info->delayed_refs_rsv.size > 0);
-}
-
-/*
- * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
- * @trans - the trans that may have generated delayed refs
- *
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
- */
-void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
- u64 num_bytes;
-
- if (!trans->delayed_ref_updates)
- return;
-
- num_bytes = btrfs_calc_trans_metadata_size(fs_info,
- trans->delayed_ref_updates);
- spin_lock(&delayed_rsv->lock);
- delayed_rsv->size += num_bytes;
- delayed_rsv->full = 0;
- spin_unlock(&delayed_rsv->lock);
- trans->delayed_ref_updates = 0;
-}
-
-/*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
- */
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- if (!trans->chunk_bytes_reserved)
- return;
-
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
- block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
- trans->chunk_bytes_reserved, NULL);
- trans->chunk_bytes_reserved = 0;
-}
-
-/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
- * root: the root of the parent directory
- * rsv: block reservation
- * items: the number of items that we need do reservation
- * use_global_rsv: allow fallback to the global block reservation
- *
- * This function is used to reserve the space for snapshot/subvolume
- * creation and deletion. Those operations are different with the
- * common file/directory operations, they change two fs/file trees
- * and root tree, the number of items that the qgroup reserves is
- * different with the free space reservation. So we can not use
- * the space reservation mechanism in start_transaction().
- */
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv, int items,
- bool use_global_rsv)
-{
- u64 qgroup_num_bytes = 0;
- u64 num_bytes;
- int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- /* One for parent inode, two for dir entries */
- qgroup_num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- qgroup_num_bytes, true);
- if (ret)
- return ret;
- }
-
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
- rsv->space_info = __find_space_info(fs_info,
- BTRFS_BLOCK_GROUP_METADATA);
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_ALL);
-
- if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
-
- if (ret && qgroup_num_bytes)
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
-
- return ret;
-}
-
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv)
-{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
-}
-
-static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
- struct btrfs_inode *inode)
-{
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 reserve_size = 0;
- u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
- unsigned outstanding_extents;
-
- lockdep_assert_held(&inode->lock);
- outstanding_extents = inode->outstanding_extents;
- if (outstanding_extents)
- reserve_size = btrfs_calc_trans_metadata_size(fs_info,
- outstanding_extents + 1);
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_trans_metadata_size(fs_info,
- csum_leaves);
- /*
- * For qgroup rsv, the calculation is very simple:
- * account one nodesize for each outstanding extent
- *
- * This is overestimating in most cases.
- */
- qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
-
- spin_lock(&block_rsv->lock);
- block_rsv->size = reserve_size;
- block_rsv->qgroup_rsv_size = qgroup_rsv_size;
- spin_unlock(&block_rsv->lock);
-}
-
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned nr_extents;
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
- int ret = 0;
- bool delalloc_lock = true;
-
- /* If we are a free space inode we need to not flush since we will be in
- * the middle of a transaction commit. We also don't need the delalloc
- * mutex since we won't race with anybody. We need this mostly to make
- * lockdep shut its filthy mouth.
- *
- * If we have a transaction open (can happen if we call truncate_block
- * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
- */
- if (btrfs_is_free_space_inode(inode)) {
- flush = BTRFS_RESERVE_NO_FLUSH;
- delalloc_lock = false;
- } else {
- if (current->journal_info)
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
- }
-
- if (delalloc_lock)
- mutex_lock(&inode->delalloc_mutex);
-
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
-
- /* Add our new extents and calculate the new rsv size. */
- spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
- btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
- spin_unlock(&inode->lock);
-
- ret = btrfs_inode_rsv_refill(inode, flush);
- if (unlikely(ret))
- goto out_fail;
-
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
- return 0;
-
-out_fail:
- spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
- btrfs_mod_outstanding_extents(inode, -nr_extents);
- inode->csum_bytes -= num_bytes;
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
- spin_unlock(&inode->lock);
-
- btrfs_inode_rsv_release(inode, true);
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
- return ret;
-}
-
-/**
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
- * @inode: the inode to release the reservation for.
- * @num_bytes: the number of bytes we are releasing.
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
- *
- * This will release the metadata reservation for an inode. This can be called
- * once we complete IO for a given set of bytes to release their metadata
- * reservations, or on error for the same reason.
- */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
-
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
- spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
- spin_unlock(&inode->lock);
-
- if (btrfs_is_testing(fs_info))
- return;
-
- btrfs_inode_rsv_release(inode, qgroup_free);
-}
-
-/**
- * btrfs_delalloc_release_extents - release our outstanding_extents
- * @inode: the inode to balance the reservation for.
- * @num_bytes: the number of bytes we originally reserved with
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
- *
- * When we reserve space we increase outstanding_extents for the extents we may
- * add. Once we've set the range as delalloc or created our ordered extents we
- * have outstanding_extents to track the real usage, so we use this to free our
- * temporarily tracked outstanding_extents. This _must_ be used in conjunction
- * with btrfs_delalloc_reserve_metadata.
- */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned num_extents;
-
- spin_lock(&inode->lock);
- num_extents = count_max_extents(num_bytes);
- btrfs_mod_outstanding_extents(inode, -num_extents);
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
- spin_unlock(&inode->lock);
-
- if (btrfs_is_testing(fs_info))
- return;
-
- btrfs_inode_rsv_release(inode, qgroup_free);
-}
-
-/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
- * delalloc
- * @inode: inode we're writing to
- * @start: start range we are writing to
- * @len: how long the range we are writing to
- * @reserved: mandatory parameter, record actually reserved qgroup ranges of
- * current reservation.
- *
- * This will do the following things
- *
- * o reserve space in data space info for num bytes
- * and reserve precious corresponding qgroup space
- * (Done in check_data_free_space)
- *
- * o reserve space for metadata space, based on the number of outstanding
- * extents and how much csums will be needed
- * also reserve metadata space in a per root over-reserve method.
- * o add to the inodes->delalloc_bytes
- * o add it to the fs_info's delalloc inodes list.
- * (Above 3 all done in delalloc_reserve_metadata)
- *
- * Return 0 for success
- * Return <0 for error(-ENOSPC or -EQUOT)
- */
-int btrfs_delalloc_reserve_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
-{
- int ret;
-
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
- if (ret < 0)
- return ret;
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
- if (ret < 0)
- btrfs_free_reserved_data_space(inode, *reserved, start, len);
- return ret;
-}
-
-/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @start: start position of the space already reserved
- * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
- *
- * This function will release the metadata space that was not used and will
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
- * list if there are no delalloc bytes left.
- * Also it will handle the qgroup reserved space.
- */
-void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free)
-{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
- btrfs_free_reserved_data_space(inode, reserved, start, len);
-}
-
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *info, u64 bytenr,
- u64 num_bytes, int alloc)
-{
- struct btrfs_block_group_cache *cache = NULL;
- u64 total = num_bytes;
- u64 old_val;
- u64 byte_in_group;
- int factor;
- int ret = 0;
-
- /* block accounting for super block */
- spin_lock(&info->delalloc_root_lock);
- old_val = btrfs_super_bytes_used(info->super_copy);
- if (alloc)
- old_val += num_bytes;
- else
- old_val -= num_bytes;
- btrfs_set_super_bytes_used(info->super_copy, old_val);
- spin_unlock(&info->delalloc_root_lock);
-
- while (total) {
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- factor = btrfs_bg_type_to_factor(cache->flags);
-
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, 1);
-
- byte_in_group = bytenr - cache->key.objectid;
- WARN_ON(byte_in_group > cache->key.offset);
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
-
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
-
- old_val = btrfs_block_group_used(&cache->item);
- num_bytes = min(total, cache->key.offset - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
- cache->space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- } else {
- old_val -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
- cache->pinned += num_bytes;
- update_bytes_pinned(cache->space_info, num_bytes);
- cache->space_info->bytes_used -= num_bytes;
- cache->space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- trace_btrfs_space_reservation(info, "pinned",
- cache->space_info->flags,
- num_bytes, 1);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(info->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- GFP_NOFS | __GFP_NOFAIL);
- }
-
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->transaction->num_dirty_bgs++;
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
-
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0)
- btrfs_mark_bg_unused(cache);
-
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
- }
-
- /* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
+ ret = btrfs_get_alloc_profile(fs_info, flags);
return ret;
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytenr;
spin_lock(&fs_info->block_group_cache_lock);
@@ -6485,20 +2581,22 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
if (!cache)
return 0;
- bytenr = cache->key.objectid;
+ bytenr = cache->start;
btrfs_put_block_group(cache);
return bytenr;
}
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+static int pin_down_extent(struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
- update_bytes_pinned(cache->space_info, num_bytes);
+ btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
+ num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@@ -6506,8 +2604,6 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- trace_btrfs_space_reservation(fs_info, "pinned",
- cache->space_info->flags, num_bytes, 1);
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
set_extent_dirty(fs_info->pinned_extents, bytenr,
@@ -6515,18 +2611,17 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
+
+ ASSERT(fs_info->running_transaction);
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
+ pin_down_extent(cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -6538,7 +2633,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret;
cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -6551,9 +2646,9 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
* to one because the slow code to read in the free extents does check
* the pinned extents.
*/
- cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, 1);
- pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
+ pin_down_extent(cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6565,25 +2660,26 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
- cache_block_group(block_group, 0);
- caching_ctl = get_caching_control(block_group);
+ btrfs_cache_block_group(block_group, 0);
+ caching_ctl = btrfs_get_caching_control(block_group);
if (!caching_ctl) {
/* Logic error */
- BUG_ON(!block_group_cache_done(block_group));
+ BUG_ON(!btrfs_block_group_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
} else {
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(fs_info, start, num_bytes);
+ ret = btrfs_add_excluded_extent(fs_info, start,
+ num_bytes);
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
@@ -6597,19 +2693,20 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
num_bytes = (start + num_bytes) -
caching_ctl->progress;
start = caching_ctl->progress;
- ret = add_excluded_extent(fs_info, start, num_bytes);
+ ret = btrfs_add_excluded_extent(fs_info, start,
+ num_bytes);
}
out_lock:
mutex_unlock(&caching_ctl->mutex);
- put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
}
btrfs_put_block_group(block_group);
return ret;
}
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_file_extent_item *item;
struct btrfs_key key;
int found_type;
@@ -6640,127 +2737,26 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
}
static void
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
atomic_inc(&bg->reservations);
}
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
- const u64 start)
-{
- struct btrfs_block_group_cache *bg;
-
- bg = btrfs_lookup_block_group(fs_info, start);
- ASSERT(bg);
- if (atomic_dec_and_test(&bg->reservations))
- wake_up_var(&bg->reservations);
- btrfs_put_block_group(bg);
-}
-
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
-{
- struct btrfs_space_info *space_info = bg->space_info;
-
- ASSERT(bg->ro);
-
- if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
- return;
-
- /*
- * Our block group is read only but before we set it to read only,
- * some task might have had allocated an extent from it already, but it
- * has not yet created a respective ordered extent (and added it to a
- * root's list of ordered extents).
- * Therefore wait for any task currently allocating extents, since the
- * block group's reservations counter is incremented while a read lock
- * on the groups' semaphore is held and decremented after releasing
- * the read access on that semaphore and creating the ordered extent.
- */
- down_write(&space_info->groups_sem);
- up_write(&space_info->groups_sem);
-
- wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
-}
-
-/**
- * btrfs_add_reserved_bytes - update the block_group and space info counters
- * @cache: The cache we are manipulating
- * @ram_bytes: The number of bytes of file content, and will be same to
- * @num_bytes except for the compress path.
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
- *
- * This is called by the allocator when it reserves space. If this is a
- * reservation and the block group has become read only we cannot make the
- * reservation and return -EAGAIN, otherwise this function always succeeds.
- */
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 ram_bytes, u64 num_bytes, int delalloc)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int ret = 0;
-
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- update_bytes_may_use(space_info, -ram_bytes);
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- return ret;
-}
-
-/**
- * btrfs_free_reserved_bytes - update the block_group and space info counters
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
- *
- * This is called by somebody who is freeing space that was never actually used
- * on disk. For example if you reserve some space for a new leaf in transaction
- * A and before transaction A commits you free that leaf, you call this with
- * reserve set to 0 in order to clear the reservation.
- */
-
-static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc)
-{
- struct btrfs_space_info *space_info = cache->space_info;
-
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->max_extent_size = 0;
-
- if (delalloc)
- cache->delalloc_bytes -= num_bytes;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
-}
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
cache = caching_ctl->block_group;
- if (block_group_cache_done(cache)) {
+ if (btrfs_block_group_done(cache)) {
cache->last_byte_to_unpin = (u64)-1;
list_del_init(&caching_ctl->list);
- put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
} else {
cache->last_byte_to_unpin = caching_ctl->progress;
}
@@ -6773,7 +2769,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
up_write(&fs_info->commit_root_sem);
- update_global_block_rsv(fs_info);
+ btrfs_update_global_block_rsv(fs_info);
}
/*
@@ -6809,7 +2805,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end,
const bool return_free_space)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
@@ -6821,7 +2817,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
while (start <= end) {
readonly = false;
if (!cache ||
- start >= cache->key.objectid + cache->key.offset) {
+ start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
@@ -6834,7 +2830,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
empty_cluster <<= 1;
}
- len = cache->key.objectid + cache->key.offset - start;
+ len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
if (start < cache->last_byte_to_unpin) {
@@ -6863,10 +2859,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- update_bytes_pinned(space_info, -len);
-
- trace_btrfs_space_reservation(fs_info, "pinned",
- space_info->flags, len, 0);
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
percpu_counter_add_batch(&space_info->total_bytes_pinned,
-len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
@@ -6884,20 +2877,17 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
to_add = min(len, global_rsv->size -
global_rsv->reserved);
global_rsv->reserved += to_add;
- update_bytes_may_use(space_info, to_add);
+ btrfs_space_info_update_bytes_may_use(fs_info,
+ space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
- trace_btrfs_space_reservation(fs_info,
- "space_info",
- space_info->flags,
- to_add, 1);
len -= to_add;
}
spin_unlock(&global_rsv->lock);
/* Add to any tickets we may have */
if (len)
- space_info_add_new_bytes(fs_info, space_info,
- len);
+ btrfs_try_granting_tickets(fs_info,
+ space_info);
}
spin_unlock(&space_info->lock);
}
@@ -6910,7 +2900,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
@@ -6956,8 +2946,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = -EROFS;
if (!trans->aborted)
ret = btrfs_discard_extent(fs_info,
- block_group->key.objectid,
- block_group->key.offset,
+ block_group->start,
+ block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
@@ -7185,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (is_data) {
- ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
+ ret = btrfs_del_csums(trans, info->csum_root, bytenr,
+ num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7198,7 +3189,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = update_block_group(trans, info, bytenr, num_bytes, 0);
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7272,28 +3263,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ref generic_ref = { 0 };
int pin = 1;
int ret;
+ btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
+ buf->start, buf->len, parent);
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
+ root->root_key.objectid);
+
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
int old_ref_mod, new_ref_mod;
- btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
- root->root_key.objectid,
- btrfs_header_level(buf), 0,
- BTRFS_DROP_DELAYED_REF);
- ret = btrfs_add_delayed_tree_ref(trans, buf->start,
- buf->len, parent,
- root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL,
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
&old_ref_mod, &new_ref_mod);
BUG_ON(ret); /* -ENOMEM */
pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
@@ -7305,8 +3295,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(fs_info, cache, buf->start,
- buf->len, 1);
+ pin_down_extent(cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -7320,8 +3309,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
out:
if (pin)
- add_pinned_bytes(fs_info, buf->len, true,
- root->root_key.objectid);
+ add_pinned_bytes(fs_info, &generic_ref);
if (last_ref) {
/*
@@ -7333,120 +3321,63 @@ out:
}
/* Can return -ENOMEM */
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
- root_objectid, owner, offset,
- BTRFS_DROP_DELAYED_REF);
-
/*
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
*/
- if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
- WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+ if ((ref->type == BTRFS_REF_METADATA &&
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ (ref->type == BTRFS_REF_DATA &&
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
- btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+ btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
old_ref_mod = new_ref_mod = 0;
ret = 0;
- } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
- num_bytes, parent,
- root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL,
+ } else if (ref->type == BTRFS_REF_METADATA) {
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
&old_ref_mod, &new_ref_mod);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
- num_bytes, parent,
- root_objectid, owner, offset,
- 0, BTRFS_DROP_DELAYED_REF,
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0,
&old_ref_mod, &new_ref_mod);
}
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+ if (!((ref->type == BTRFS_REF_METADATA &&
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ (ref->type == BTRFS_REF_DATA &&
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ btrfs_ref_tree_mod(fs_info, ref);
- add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
- }
+ if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+ add_pinned_bytes(fs_info, ref);
return ret;
}
-/*
- * when we wait for progress in the block group caching, its because
- * our allocation attempt failed at least once. So, we must sleep
- * and let some progress happen before we try again.
- *
- * This function will sleep at least once waiting for new free space to
- * show up, and then it will check the block group free space numbers
- * for our min num_bytes. Another option is to have it go ahead
- * and look in the rbtree for a free extent of a given size, but this
- * is a good start.
- *
- * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
- * any of the information in this block group.
- */
-static noinline void
-wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
- u64 num_bytes)
-{
- struct btrfs_caching_control *caching_ctl;
-
- caching_ctl = get_caching_control(cache);
- if (!caching_ctl)
- return;
-
- wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
- (cache->free_space_ctl->free_space >= num_bytes));
-
- put_caching_control(caching_ctl);
-}
-
-static noinline int
-wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
-{
- struct btrfs_caching_control *caching_ctl;
- int ret = 0;
-
- caching_ctl = get_caching_control(cache);
- if (!caching_ctl)
- return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, block_group_cache_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
- put_caching_control(caching_ctl);
- return ret;
-}
-
enum btrfs_loop_type {
- LOOP_CACHING_NOWAIT = 0,
- LOOP_CACHING_WAIT = 1,
- LOOP_ALLOC_CHUNK = 2,
- LOOP_NO_EMPTY_SIZE = 3,
+ LOOP_CACHING_NOWAIT,
+ LOOP_CACHING_WAIT,
+ LOOP_ALLOC_CHUNK,
+ LOOP_NO_EMPTY_SIZE,
};
static inline void
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+btrfs_lock_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
-static inline void
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
int delalloc)
{
btrfs_get_block_group(cache);
@@ -7454,12 +3385,12 @@ btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
down_read(&cache->data_rwsem);
}
-static struct btrfs_block_group_cache *
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+static struct btrfs_block_group *btrfs_lock_cluster(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
{
- struct btrfs_block_group_cache *used_bg = NULL;
+ struct btrfs_block_group *used_bg = NULL;
spin_lock(&cluster->refill_lock);
while (1) {
@@ -7493,7 +3424,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
}
static inline void
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+btrfs_release_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
@@ -7564,13 +3495,12 @@ struct find_free_extent_ctl {
* Return >0 to inform caller that we find nothing
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
-static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group_cache **cluster_bg_ret)
+ struct btrfs_block_group **cluster_bg_ret)
{
- struct btrfs_fs_info *fs_info = bg->fs_info;
- struct btrfs_block_group_cache *cluster_bg;
+ struct btrfs_block_group *cluster_bg;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -7583,7 +3513,7 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
- ffe_ctl->num_bytes, cluster_bg->key.objectid,
+ ffe_ctl->num_bytes, cluster_bg->start,
&ffe_ctl->max_extent_size);
if (offset) {
/* We have a block, we're done */
@@ -7629,9 +3559,8 @@ refill_cluster:
aligned_cluster = max_t(u64,
ffe_ctl->empty_cluster + ffe_ctl->empty_size,
bg->full_stripe_len);
- ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
- ffe_ctl->search_start, ffe_ctl->num_bytes,
- aligned_cluster);
+ ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, aligned_cluster);
if (ret == 0) {
/* Now pull our allocation out of this cluster */
offset = btrfs_alloc_from_cluster(bg, last_ptr,
@@ -7651,7 +3580,7 @@ refill_cluster:
spin_unlock(&last_ptr->refill_lock);
ffe_ctl->retry_clustered = true;
- wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
ffe_ctl->empty_cluster + ffe_ctl->empty_size);
return -EAGAIN;
}
@@ -7670,7 +3599,7 @@ refill_cluster:
* Return 0 when we found an free extent and set ffe_ctrl->found_offset
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
-static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl)
{
@@ -7718,8 +3647,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
*/
if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
- wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
- ffe_ctl->empty_size);
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_size);
ffe_ctl->retry_unclustered = true;
return -EAGAIN;
} else if (!offset) {
@@ -7802,8 +3731,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return ret;
}
- ret = do_chunk_alloc(trans, ffe_ctl->flags,
- CHUNK_ALLOC_FORCE);
+ ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+ CHUNK_ALLOC_FORCE);
/*
* If we can't allocate a new chunk we've already looped
@@ -7871,8 +3800,9 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 flags, int delalloc)
{
int ret = 0;
+ int cache_block_group_error = 0;
struct btrfs_free_cluster *last_ptr = NULL;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool use_cluster = true;
@@ -7899,7 +3829,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
- space_info = __find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
btrfs_err(fs_info, "No space info for %llu", flags);
return -ENOSPC;
@@ -7995,7 +3925,7 @@ search:
continue;
btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->key.objectid;
+ ffe_ctl.search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
@@ -8004,9 +3934,8 @@ search:
*/
if (!block_group_bits(block_group, flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 |
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
BTRFS_BLOCK_GROUP_RAID10;
/*
@@ -8016,14 +3945,35 @@ search:
*/
if ((flags & extra) && !(block_group->flags & extra))
goto loop;
+
+ /*
+ * This block group has different flags than we want.
+ * It's possible that we have MIXED_GROUP flag but no
+ * block group is mixed. Just skip such block group.
+ */
+ btrfs_release_block_group(block_group, delalloc);
+ continue;
}
have_block_group:
- ffe_ctl.cached = block_group_cache_done(block_group);
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
- ret = cache_block_group(block_group, 0);
- BUG_ON(ret < 0);
+ ret = btrfs_cache_block_group(block_group, 0);
+
+ /*
+ * If we get ENOMEM here or something else we want to
+ * try other block groups, because it may not be fatal.
+ * However if we can't find anything else we need to
+ * save our return here so that we return the actual
+ * error that caused problems, not ENOSPC.
+ */
+ if (ret < 0) {
+ if (!cache_block_group_error)
+ cache_block_group_error = ret;
+ ret = 0;
+ goto loop;
+ }
ret = 0;
}
@@ -8035,7 +3985,7 @@ have_block_group:
* lets look there
*/
if (last_ptr && use_cluster) {
- struct btrfs_block_group_cache *cluster_bg = NULL;
+ struct btrfs_block_group *cluster_bg = NULL;
ret = find_free_extent_clustered(block_group, last_ptr,
&ffe_ctl, &cluster_bg);
@@ -8068,7 +4018,7 @@ checks:
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
+ block_group->start + block_group->length) {
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
num_bytes);
goto loop;
@@ -8110,7 +4060,7 @@ loop:
if (ret > 0)
goto search;
- if (ret == -ENOSPC) {
+ if (ret == -ENOSPC && !cache_block_group_error) {
/*
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
@@ -8121,64 +4071,12 @@ loop:
space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
ins->offset = ffe_ctl.max_extent_size;
+ } else if (ret == -ENOSPC) {
+ ret = cache_block_group_error;
}
return ret;
}
-#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
-do { \
- struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
- spin_lock(&__rsv->lock); \
- btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
- __rsv->size, __rsv->reserved); \
- spin_unlock(&__rsv->lock); \
-} while (0)
-
-static void dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups)
-{
- struct btrfs_block_group_cache *cache;
- int index = 0;
-
- spin_lock(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
- info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
- info->full ? "" : "not ");
- btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
- info->total_bytes, info->bytes_used, info->bytes_pinned,
- info->bytes_reserved, info->bytes_may_use,
- info->bytes_readonly);
- spin_unlock(&info->lock);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
- if (!dump_block_groups)
- return;
-
- down_read(&info->groups_sem);
-again:
- list_for_each_entry(cache, &info->block_groups[index], list) {
- spin_lock(&cache->lock);
- btrfs_info(fs_info,
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
- cache->key.objectid, cache->key.offset,
- btrfs_block_group_used(&cache->item), cache->pinned,
- cache->reserved, cache->ro ? "[readonly]" : "");
- btrfs_dump_free_space(cache, bytes);
- spin_unlock(&cache->lock);
- }
- if (++index < BTRFS_NR_RAID_TYPES)
- goto again;
- up_read(&info->groups_sem);
-}
-
/*
* btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
* hole that is at least as big as @num_bytes.
@@ -8254,12 +4152,13 @@ again:
} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
- sinfo = __find_space_info(fs_info, flags);
+ sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
"allocation failed flags %llu, wanted %llu",
flags, num_bytes);
if (sinfo)
- dump_space_info(fs_info, sinfo, num_bytes, 1);
+ btrfs_dump_space_info(fs_info, sinfo,
+ num_bytes, 1);
}
}
@@ -8270,7 +4169,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len,
int pin, int delalloc)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret = 0;
cache = btrfs_lookup_block_group(fs_info, start);
@@ -8281,7 +4180,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
}
if (pin)
- pin_down_extent(fs_info, cache, start, len, 1);
+ pin_down_extent(cache, start, len, 1);
else {
if (btrfs_test_opt(fs_info, DISCARD))
ret = btrfs_discard_extent(fs_info, start, len, NULL);
@@ -8370,7 +4269,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -8460,8 +4359,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(trans, fs_info, extent_key.objectid,
- fs_info->nodesize, 1);
+ ret = btrfs_update_block_group(trans, extent_key.objectid,
+ fs_info->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
@@ -8478,19 +4377,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
+ struct btrfs_ref generic_ref = { 0 };
int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
- root->root_key.objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT);
-
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
- ins->offset, 0,
- root->root_key.objectid, owner,
- offset, ram_bytes,
- BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+ ins->objectid, ins->offset, 0);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_ref_tree_mod(root->fs_info, &generic_ref);
+ ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
+ ram_bytes, NULL, NULL);
return ret;
}
@@ -8505,7 +4402,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
/*
@@ -8563,7 +4460,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
- clean_tree_block(fs_info, buf);
+ btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking_write(buf);
@@ -8599,73 +4496,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return buf;
}
-static struct btrfs_block_rsv *
-use_block_rsv(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *block_rsv;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- int ret;
- bool global_updated = false;
-
- block_rsv = get_block_rsv(trans, root);
-
- if (unlikely(block_rsv->size == 0))
- goto try_reserve;
-again:
- ret = block_rsv_use_bytes(block_rsv, blocksize);
- if (!ret)
- return block_rsv;
-
- if (block_rsv->failfast)
- return ERR_PTR(ret);
-
- if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
- global_updated = true;
- update_global_block_rsv(fs_info);
- goto again;
- }
-
- /*
- * The global reserve still exists to save us from ourselves, so don't
- * warn_on if we are short on our delayed refs reserve.
- */
- if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
- btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL * 10,
- /*DEFAULT_RATELIMIT_BURST*/ 1);
- if (__ratelimit(&_rs))
- WARN(1, KERN_DEBUG
- "BTRFS: block rsv returned %d\n", ret);
- }
-try_reserve:
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
- return block_rsv;
- /*
- * If we couldn't reserve metadata bytes try and use some from
- * the global reserve if its space type is the same as the global
- * reservation.
- */
- if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
- block_rsv->space_info == global_rsv->space_info) {
- ret = block_rsv_use_bytes(global_rsv, blocksize);
- if (!ret)
- return global_rsv;
- }
- return ERR_PTR(ret);
-}
-
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv, u32 blocksize)
-{
- block_rsv_add_bytes(block_rsv, blocksize, false);
- block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
-}
-
/*
* finds a free extent and does all the dirty work required for allocation
* returns the tree buffer or an ERR_PTR on error.
@@ -8682,6 +4512,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_ref generic_ref = { 0 };
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -8697,7 +4528,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
}
#endif
- block_rsv = use_block_rsv(trans, root, blocksize);
+ block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
@@ -8736,13 +4567,12 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
- btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
- root_objectid, level, 0,
- BTRFS_ADD_DELAYED_EXTENT);
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
- ins.offset, parent,
- root_objectid, level,
- BTRFS_ADD_DELAYED_EXTENT,
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+ ins.objectid, ins.offset, parent);
+ generic_ref.real_root = root->root_key.objectid;
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
extent_op, NULL, NULL);
if (ret)
goto out_free_delayed;
@@ -8756,7 +4586,7 @@ out_free_buf:
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
- unuse_block_rsv(fs_info, block_rsv, blocksize);
+ btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
@@ -8918,7 +4748,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
+ ret = btrfs_set_disk_extent_flags(trans, eb->start,
eb->len, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
@@ -8987,6 +4817,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 parent;
struct btrfs_key key;
struct btrfs_key first_key;
+ struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -9159,9 +4990,10 @@ skip:
wc->drop_level = level;
find_next_key(path, level, &wc->drop_progress);
- ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
- parent, root->root_key.objectid,
- level - 1, 0);
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ fs_info->nodesize, parent);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
}
@@ -9251,21 +5083,23 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
else
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_qgroup_trace_leaf_items(trans, eb);
- if (ret) {
- btrfs_err_rl(fs_info,
- "error %d accounting leaf items. Quota is out of sync, rescan required.",
+ if (is_fstree(root->root_key.objectid)) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, eb);
+ if (ret) {
+ btrfs_err_rl(fs_info,
+ "error %d accounting leaf items, quota is out of sync, rescan required",
ret);
+ }
}
}
- /* make block locked assertion in clean_tree_block happy */
+ /* make block locked assertion in btrfs_clean_tree_block happy */
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
- clean_tree_block(fs_info, eb);
+ btrfs_clean_tree_block(eb);
}
if (eb == root->node) {
@@ -9638,7 +5472,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
- extent_buffer_get(parent);
+ atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -9676,195 +5510,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
-}
-
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
-{
- struct btrfs_space_info *sinfo = cache->space_info;
- u64 num_bytes;
- u64 sinfo_used;
- u64 min_allocable_bytes;
- int ret = -ENOSPC;
-
- /*
- * We need some metadata space and system metadata space for
- * allocating chunks in some corner cases until we force to set
- * it to be readonly.
- */
- if ((sinfo->flags &
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
- !force)
- min_allocable_bytes = SZ_1M;
- else
- min_allocable_bytes = 0;
-
- spin_lock(&sinfo->lock);
- spin_lock(&cache->lock);
-
- if (cache->ro) {
- cache->ro++;
- ret = 0;
- goto out;
- }
-
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo_used = btrfs_space_info_used(sinfo, true);
-
- if (sinfo_used + num_bytes + min_allocable_bytes <=
- sinfo->total_bytes) {
- sinfo->bytes_readonly += num_bytes;
- cache->ro++;
- list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
- ret = 0;
- }
-out:
- spin_unlock(&cache->lock);
- spin_unlock(&sinfo->lock);
- if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
- btrfs_info(cache->fs_info,
- "unable to make block group %llu ro",
- cache->key.objectid);
- btrfs_info(cache->fs_info,
- "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
- sinfo_used, num_bytes, min_allocable_bytes);
- dump_space_info(cache->fs_info, cache->space_info, 0, 0);
- }
- return ret;
-}
-
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
-
-{
- struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_trans_handle *trans;
- u64 alloc_flags;
- int ret;
-
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
- */
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
-
- mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
-
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
- }
-
- /*
- * if we are changing raid levels, try to allocate a corresponding
- * block group with the new raid level.
- */
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, alloc_flags,
- CHUNK_ALLOC_FORCE);
- /*
- * ENOSPC is allowed here, we may have enough space
- * already allocated at the new raid level to
- * carry on
- */
- if (ret == -ENOSPC)
- ret = 0;
- if (ret < 0)
- goto out;
- }
-
- ret = inc_block_group_ro(cache, 0);
- if (!ret)
- goto out;
- alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
- ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
- if (ret < 0)
- goto out;
- ret = inc_block_group_ro(cache, 0);
-out:
- if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
- mutex_lock(&fs_info->chunk_mutex);
- check_system_chunk(trans, alloc_flags);
- mutex_unlock(&fs_info->chunk_mutex);
- }
- mutex_unlock(&fs_info->ro_block_group_mutex);
-
- btrfs_end_transaction(trans);
- return ret;
-}
-
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
-{
- u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
-
- return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
-}
-
/*
* helper to account the unused space of all the readonly block group in the
* space_info. takes mirrors into account.
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
@@ -9882,9 +5534,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
}
factor = btrfs_bg_type_to_factor(block_group->flags);
- free_bytes += (block_group->key.offset -
- btrfs_block_group_used(&block_group->item)) *
- factor;
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
spin_unlock(&block_group->lock);
}
@@ -9893,1401 +5544,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
-{
- struct btrfs_space_info *sinfo = cache->space_info;
- u64 num_bytes;
-
- BUG_ON(!cache->ro);
-
- spin_lock(&sinfo->lock);
- spin_lock(&cache->lock);
- if (!--cache->ro) {
- num_bytes = cache->key.offset - cache->reserved -
- cache->pinned - cache->bytes_super -
- btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- list_del_init(&cache->ro_list);
- }
- spin_unlock(&cache->lock);
- spin_unlock(&sinfo->lock);
-}
-
-/*
- * Checks to see if it's even possible to relocate this block group.
- *
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
- * ok to go ahead and try.
- */
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_space_info *space_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_device *device;
- struct btrfs_trans_handle *trans;
- u64 min_free;
- u64 dev_min = 1;
- u64 dev_nr = 0;
- u64 target;
- int debug;
- int index;
- int full = 0;
- int ret = 0;
-
- debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
-
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
-
- /* odd, couldn't find the block group, leave it alone */
- if (!block_group) {
- if (debug)
- btrfs_warn(fs_info,
- "can't find block group for bytenr %llu",
- bytenr);
- return -1;
- }
-
- min_free = btrfs_block_group_used(&block_group->item);
-
- /* no bytes used, we're good */
- if (!min_free)
- goto out;
-
- space_info = block_group->space_info;
- spin_lock(&space_info->lock);
-
- full = space_info->full;
-
- /*
- * if this is the last block group we have in this space, we can't
- * relocate it unless we're able to allocate a new chunk below.
- *
- * Otherwise, we need to make sure we have room in the space to handle
- * all of the extents from this block group. If we can, we're good
- */
- if ((space_info->total_bytes != block_group->key.offset) &&
- (btrfs_space_info_used(space_info, false) + min_free <
- space_info->total_bytes)) {
- spin_unlock(&space_info->lock);
- goto out;
- }
- spin_unlock(&space_info->lock);
-
- /*
- * ok we don't have enough space, but maybe we have free space on our
- * devices to allocate new chunks for relocation, so loop through our
- * alloc devices and guess if we have enough space. if this block
- * group is going to be restriped, run checks against the target
- * profile instead of the current one.
- */
- ret = -1;
-
- /*
- * index:
- * 0: raid10
- * 1: raid1
- * 2: dup
- * 3: raid0
- * 4: single
- */
- target = get_restripe_target(fs_info, block_group->flags);
- if (target) {
- index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
- } else {
- /*
- * this is just a balance, so if we were marked as full
- * we know there is no space for a new chunk
- */
- if (full) {
- if (debug)
- btrfs_warn(fs_info,
- "no space to alloc new chunk for block group %llu",
- block_group->key.objectid);
- goto out;
- }
-
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
- }
-
- if (index == BTRFS_RAID_RAID10) {
- dev_min = 4;
- /* Divide by 2 */
- min_free >>= 1;
- } else if (index == BTRFS_RAID_RAID1) {
- dev_min = 2;
- } else if (index == BTRFS_RAID_DUP) {
- /* Multiply by 2 */
- min_free <<= 1;
- } else if (index == BTRFS_RAID_RAID0) {
- dev_min = fs_devices->rw_devices;
- min_free = div64_u64(min_free, dev_min);
- }
-
- /* We need to do this so that we can look at pending chunks */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 dev_offset;
-
- /*
- * check to make sure we can actually find a chunk with enough
- * space to fit our block group in.
- */
- if (device->total_bytes > device->bytes_used + min_free &&
- !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = find_free_dev_extent(trans, device, min_free,
- &dev_offset, NULL);
- if (!ret)
- dev_nr++;
-
- if (dev_nr >= dev_min)
- break;
-
- ret = -1;
- }
- }
- if (debug && ret == -1)
- btrfs_warn(fs_info,
- "no space to allocate a new chunk for block group %llu",
- block_group->key.objectid);
- mutex_unlock(&fs_info->chunk_mutex);
- btrfs_end_transaction(trans);
-out:
- btrfs_put_block_group(block_group);
- return ret;
-}
-
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- struct btrfs_key *key)
-{
- struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
- struct btrfs_block_group_item bg;
- u64 flags;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
- if (found_key.objectid >= key->objectid &&
- found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree.map_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
- }
- path->slots[0]++;
- }
-out:
- return ret;
-}
-
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
-{
- struct btrfs_block_group_cache *block_group;
- u64 last = 0;
-
- while (1) {
- struct inode *inode;
-
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
- spin_unlock(&block_group->lock);
- block_group = next_block_group(info, block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->key.objectid + block_group->key.offset;
- btrfs_put_block_group(block_group);
- }
-}
-
-/*
- * Must be called only after stopping all workers, since we could have block
- * group caching kthreads running, and therefore they could race with us if we
- * freed the block groups before stopping them.
- */
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_space_info *space_info;
- struct btrfs_caching_control *caching_ctl;
- struct rb_node *n;
-
- down_write(&info->commit_root_sem);
- while (!list_empty(&info->caching_block_groups)) {
- caching_ctl = list_entry(info->caching_block_groups.next,
- struct btrfs_caching_control, list);
- list_del(&caching_ctl->list);
- put_caching_control(caching_ctl);
- }
- up_write(&info->commit_root_sem);
-
- spin_lock(&info->unused_bgs_lock);
- while (!list_empty(&info->unused_bgs)) {
- block_group = list_first_entry(&info->unused_bgs,
- struct btrfs_block_group_cache,
- bg_list);
- list_del_init(&block_group->bg_list);
- btrfs_put_block_group(block_group);
- }
- spin_unlock(&info->unused_bgs_lock);
-
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
- block_group = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
- RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
-
- down_write(&block_group->space_info->groups_sem);
- list_del(&block_group->list);
- up_write(&block_group->space_info->groups_sem);
-
- /*
- * We haven't cached this block group, which means we could
- * possibly have excluded extents on this block group.
- */
- if (block_group->cached == BTRFS_CACHE_NO ||
- block_group->cached == BTRFS_CACHE_ERROR)
- free_excluded_extents(block_group);
-
- btrfs_remove_free_space_cache(block_group);
- ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
- ASSERT(list_empty(&block_group->dirty_list));
- ASSERT(list_empty(&block_group->io_list));
- ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
- btrfs_put_block_group(block_group);
-
- spin_lock(&info->block_group_cache_lock);
- }
- spin_unlock(&info->block_group_cache_lock);
-
- /* now that all the block groups are freed, go through and
- * free all the space_info structs. This is only called during
- * the final stages of unmount, and so we know nobody is
- * using them. We call synchronize_rcu() once before we start,
- * just to be on the safe side.
- */
- synchronize_rcu();
-
- release_global_block_rsv(info);
-
- while (!list_empty(&info->space_info)) {
- int i;
-
- space_info = list_entry(info->space_info.next,
- struct btrfs_space_info,
- list);
-
- /*
- * Do not hide this behind enospc_debug, this is actually
- * important and indicates a real bug if this happens.
- */
- if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0))
- dump_space_info(info, space_info, 0, 0);
- list_del(&space_info->list);
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
- struct kobject *kobj;
- kobj = space_info->block_group_kobjs[i];
- space_info->block_group_kobjs[i] = NULL;
- if (kobj) {
- kobject_del(kobj);
- kobject_put(kobj);
- }
- }
- kobject_del(&space_info->kobj);
- kobject_put(&space_info->kobj);
- }
- return 0;
-}
-
-/* link_block_group will queue up kobjects to add when we're reclaim-safe */
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_space_info *space_info;
- struct raid_kobject *rkobj;
- LIST_HEAD(list);
- int index;
- int ret = 0;
-
- spin_lock(&fs_info->pending_raid_kobjs_lock);
- list_splice_init(&fs_info->pending_raid_kobjs, &list);
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
-
- list_for_each_entry(rkobj, &list, list) {
- space_info = __find_space_info(fs_info, rkobj->flags);
- index = btrfs_bg_flags_to_raid_index(rkobj->flags);
-
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
- "%s", get_raid_name(index));
- if (ret) {
- kobject_put(&rkobj->kobj);
- break;
- }
- }
- if (ret)
- btrfs_warn(fs_info,
- "failed to add kobject for block cache, ignoring");
-}
-
-static void link_block_group(struct btrfs_block_group_cache *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- struct btrfs_fs_info *fs_info = cache->fs_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
-
- down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-
- if (first) {
- struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
- if (!rkobj) {
- btrfs_warn(cache->fs_info,
- "couldn't alloc memory for raid level kobject");
- return;
- }
- rkobj->flags = cache->flags;
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-
- spin_lock(&fs_info->pending_raid_kobjs_lock);
- list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
- space_info->block_group_kobjs[index] = &rkobj->kobj;
- }
-}
-
-static struct btrfs_block_group_cache *
-btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
- u64 start, u64 size)
-{
- struct btrfs_block_group_cache *cache;
-
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
- if (!cache)
- return NULL;
-
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return NULL;
- }
-
- cache->key.objectid = start;
- cache->key.offset = size;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-
- cache->fs_info = fs_info;
- cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
-
- atomic_set(&cache->count, 1);
- spin_lock_init(&cache->lock);
- init_rwsem(&cache->data_rwsem);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->bg_list);
- INIT_LIST_HEAD(&cache->ro_list);
- INIT_LIST_HEAD(&cache->dirty_list);
- INIT_LIST_HEAD(&cache->io_list);
- btrfs_init_free_space_ctl(cache);
- atomic_set(&cache->trimming, 0);
- mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
-
- return cache;
-}
-
-
-/*
- * Iterate all chunks and verify that each of them has the corresponding block
- * group
- */
-static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group_cache *bg;
- u64 start = 0;
- int ret = 0;
-
- while (1) {
- read_lock(&map_tree->map_tree.lock);
- /*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
- * get the first chunk.
- */
- em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
- read_unlock(&map_tree->map_tree.lock);
- if (!em)
- break;
-
- bg = btrfs_lookup_block_group(fs_info, em->start);
- if (!bg) {
- btrfs_err(fs_info,
- "chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
- ret = -EUCLEAN;
- free_extent_map(em);
- break;
- }
- if (bg->key.objectid != em->start ||
- bg->key.offset != em->len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
- bg->key.objectid, bg->key.offset,
- bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
- ret = -EUCLEAN;
- free_extent_map(em);
- btrfs_put_block_group(bg);
- break;
- }
- start = em->start + em->len;
- free_extent_map(em);
- btrfs_put_block_group(bg);
- }
- return ret;
-}
-
-int btrfs_read_block_groups(struct btrfs_fs_info *info)
-{
- struct btrfs_path *path;
- int ret;
- struct btrfs_block_group_cache *cache;
- struct btrfs_space_info *space_info;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int need_clear = 0;
- u64 cache_gen;
- u64 feature;
- int mixed;
-
- feature = btrfs_super_incompat_flags(info->super_copy);
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
-
- key.objectid = 0;
- key.offset = 0;
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_FORWARD;
-
- cache_gen = btrfs_super_cache_generation(info->super_copy);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- btrfs_super_generation(info->super_copy) != cache_gen)
- need_clear = 1;
- if (btrfs_test_opt(info, CLEAR_CACHE))
- need_clear = 1;
-
- while (1) {
- ret = find_first_block_group(info, path, &key);
- if (ret > 0)
- break;
- if (ret != 0)
- goto error;
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
- found_key.offset);
- if (!cache) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (need_clear) {
- /*
- * When we mount with old space cache, we need to
- * set BTRFS_DC_CLEAR and set dirty flag.
- *
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
- * truncate the old free space cache inode and
- * setup a new one.
- * b) Setting 'dirty flag' makes sure that we flush
- * the new space cache info onto disk.
- */
- if (btrfs_test_opt(info, SPACE_CACHE))
- cache->disk_cache_state = BTRFS_DC_CLEAR;
- }
-
- read_extent_buffer(leaf, &cache->item,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- sizeof(cache->item));
- cache->flags = btrfs_block_group_flags(&cache->item);
- if (!mixed &&
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
- btrfs_err(info,
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
- cache->key.objectid);
- ret = -EINVAL;
- goto error;
- }
-
- key.objectid = found_key.objectid + found_key.offset;
- btrfs_release_path(path);
-
- /*
- * We need to exclude the super stripes now so that the space
- * info has super bytes accounted for, otherwise we'll think
- * we have more space than we actually do.
- */
- ret = exclude_super_stripes(cache);
- if (ret) {
- /*
- * We may have excluded something, so call this just in
- * case.
- */
- free_excluded_extents(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- /*
- * check for two cases, either we are full, and therefore
- * don't need to bother with the caching work since we won't
- * find any space, or we are empty, and we can just add all
- * the space in and be done with it. This saves us _a_lot_ of
- * time, particularly in the full case.
- */
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- free_excluded_extents(cache);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, found_key.objectid,
- found_key.objectid +
- found_key.offset);
- free_excluded_extents(cache);
- }
-
- ret = btrfs_add_block_group_cache(info, cache);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- trace_btrfs_add_block_group(info, cache, 0);
- update_space_info(info, cache->flags, found_key.offset,
- btrfs_block_group_used(&cache->item),
- cache->bytes_super, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
-
- set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
- inc_block_group_ro(cache, 1);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- ASSERT(list_empty(&cache->bg_list));
- btrfs_mark_bg_unused(cache);
- }
- }
-
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
- if (!(get_alloc_profile(info, space_info->flags) &
- (BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 |
- BTRFS_BLOCK_GROUP_DUP)))
- continue;
- /*
- * avoid allocating from un-mirrored block group if there are
- * mirrored block groups.
- */
- list_for_each_entry(cache,
- &space_info->block_groups[BTRFS_RAID_RAID0],
- list)
- inc_block_group_ro(cache, 1);
- list_for_each_entry(cache,
- &space_info->block_groups[BTRFS_RAID_SINGLE],
- list)
- inc_block_group_ro(cache, 1);
- }
-
- btrfs_add_raid_kobjects(info);
- init_global_block_rsv(info);
- ret = check_chunk_block_group_mappings(info);
-error:
- btrfs_free_path(path);
- return ret;
-}
-
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_block_group_item item;
- struct btrfs_key key;
- int ret = 0;
-
- if (!trans->can_flush_pending_bgs)
- return;
-
- while (!list_empty(&trans->new_bgs)) {
- block_group = list_first_entry(&trans->new_bgs,
- struct btrfs_block_group_cache,
- bg_list);
- if (ret)
- goto next;
-
- spin_lock(&block_group->lock);
- memcpy(&item, &block_group->item, sizeof(item));
- memcpy(&key, &block_group->key, sizeof(key));
- spin_unlock(&block_group->lock);
-
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
- sizeof(item));
- if (ret)
- btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
- if (ret)
- btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
- /* already aborted the transaction if it failed. */
-next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
- list_del_init(&block_group->bg_list);
- }
- btrfs_trans_release_chunk_metadata(trans);
-}
-
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
- int ret;
-
- btrfs_set_log_full_commit(fs_info, trans);
-
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
- if (!cache)
- return -ENOMEM;
-
- btrfs_set_block_group_used(&cache->item, bytes_used);
- btrfs_set_block_group_chunk_objectid(&cache->item,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_block_group_flags(&cache->item, type);
-
- cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->needs_free_space = 1;
- ret = exclude_super_stripes(cache);
- if (ret) {
- /*
- * We may have excluded something, so call this just in
- * case.
- */
- free_excluded_extents(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
-
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
-
- free_excluded_extents(cache);
-
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
- /*
- * Ensure the corresponding space_info object is created and
- * assigned to our block group. We want our bg to be added to the rbtree
- * with its ->space_info set.
- */
- cache->space_info = __find_space_info(fs_info, cache->flags);
- ASSERT(cache->space_info);
-
- ret = btrfs_add_block_group_cache(fs_info, cache);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
-
- /*
- * Now that our block group has its ->space_info set and is inserted in
- * the rbtree, update the space info's counters.
- */
- trace_btrfs_add_block_group(fs_info, cache, 1);
- update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
- update_global_block_rsv(fs_info);
-
- link_block_group(cache);
-
- list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
-
- set_avail_alloc_bits(fs_info, type);
- return 0;
-}
-
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 extra_flags = chunk_to_extended(flags) &
- BTRFS_EXTENDED_PROFILE_MASK;
-
- write_seqlock(&fs_info->profiles_lock);
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits &= ~extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits &= ~extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits &= ~extra_flags;
- write_sequnlock(&fs_info->profiles_lock);
-}
-
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_path *path;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
- struct inode *inode;
- struct kobject *kobj = NULL;
- int ret;
- int index;
- int factor;
- struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
- bool remove_rsv = false;
-
- block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!block_group);
- BUG_ON(!block_group->ro);
-
- trace_btrfs_remove_block_group(block_group);
- /*
- * Free the reserved super bytes from this block group before
- * remove it.
- */
- free_excluded_extents(block_group);
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
- block_group->key.offset);
-
- memcpy(&key, &block_group->key, sizeof(key));
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
- factor = btrfs_bg_type_to_factor(block_group->flags);
-
- /* make sure this block group isn't part of an allocation cluster */
- cluster = &fs_info->data_alloc_cluster;
- spin_lock(&cluster->refill_lock);
- btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&cluster->refill_lock);
-
- /*
- * make sure this block group isn't part of a metadata
- * allocation cluster
- */
- cluster = &fs_info->meta_alloc_cluster;
- spin_lock(&cluster->refill_lock);
- btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&cluster->refill_lock);
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * get the inode first so any iput calls done for the io_list
- * aren't the final iput (no unlinks allowed now)
- */
- inode = lookup_free_space_inode(fs_info, block_group, path);
-
- mutex_lock(&trans->transaction->cache_write_mutex);
- /*
- * Make sure our free space cache IO is done before removing the
- * free space inode
- */
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->io_list)) {
- list_del_init(&block_group->io_list);
-
- WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
-
- spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_wait_cache_io(trans, block_group, path);
- btrfs_put_block_group(block_group);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- }
-
- if (!list_empty(&block_group->dirty_list)) {
- list_del_init(&block_group->dirty_list);
- remove_rsv = true;
- btrfs_put_block_group(block_group);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
- key.type = 0;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out;
- btrfs_release_path(path);
- }
-
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
- RB_CLEAR_NODE(&block_group->cache_node);
-
- if (fs_info->first_logical_byte == block_group->key.objectid)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
-
- down_write(&block_group->space_info->groups_sem);
- /*
- * we must use list_del_init so people can check to see if they
- * are still on the list after taking the semaphore
- */
- list_del_init(&block_group->list);
- if (list_empty(&block_group->space_info->block_groups[index])) {
- kobj = block_group->space_info->block_group_kobjs[index];
- block_group->space_info->block_group_kobjs[index] = NULL;
- clear_avail_alloc_bits(fs_info, block_group->flags);
- }
- up_write(&block_group->space_info->groups_sem);
- if (kobj) {
- kobject_del(kobj);
- kobject_put(kobj);
- }
-
- if (block_group->has_caching_ctl)
- caching_ctl = get_caching_control(block_group);
- if (block_group->cached == BTRFS_CACHE_STARTED)
- wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- put_caching_control(caching_ctl);
- put_caching_control(caching_ctl);
- }
- }
-
- spin_lock(&trans->transaction->dirty_bgs_lock);
- WARN_ON(!list_empty(&block_group->dirty_list));
- WARN_ON(!list_empty(&block_group->io_list));
- spin_unlock(&trans->transaction->dirty_bgs_lock);
-
- btrfs_remove_free_space_cache(block_group);
-
- spin_lock(&block_group->space_info->lock);
- list_del_init(&block_group->ro_list);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- WARN_ON(block_group->space_info->total_bytes
- < block_group->key.offset);
- WARN_ON(block_group->space_info->bytes_readonly
- < block_group->key.offset);
- WARN_ON(block_group->space_info->disk_total
- < block_group->key.offset * factor);
- }
- block_group->space_info->total_bytes -= block_group->key.offset;
- block_group->space_info->bytes_readonly -= block_group->key.offset;
- block_group->space_info->disk_total -= block_group->key.offset * factor;
-
- spin_unlock(&block_group->space_info->lock);
-
- memcpy(&key, &block_group->key, sizeof(key));
-
- mutex_lock(&fs_info->chunk_mutex);
- if (!list_empty(&em->list)) {
- /* We're in the transaction->pending_chunks list. */
- free_extent_map(em);
- }
- spin_lock(&block_group->lock);
- block_group->removed = 1;
- /*
- * At this point trimming can't start on this block group, because we
- * removed the block group from the tree fs_info->block_group_cache_tree
- * so no one can't find it anymore and even if someone already got this
- * block group before we removed it from the rbtree, they have already
- * incremented block_group->trimming - if they didn't, they won't find
- * any free space entries because we already removed them all when we
- * called btrfs_remove_free_space_cache().
- *
- * And we must not remove the extent map from the fs_info->mapping_tree
- * to prevent the same logical address range and physical device space
- * ranges from being reused for a new block group. This is because our
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
- * completely transactionless, so while it is trimming a range the
- * currently running transaction might finish and a new one start,
- * allowing for new block groups to be created that can reuse the same
- * physical device locations unless we take this special care.
- *
- * There may also be an implicit trim operation if the file system
- * is mounted with -odiscard. The same protections must remain
- * in place until the extents have been discarded completely when
- * the transaction commit has completed.
- */
- remove_em = (atomic_read(&block_group->trimming) == 0);
- /*
- * Make sure a trimmer task always sees the em in the pinned_chunks list
- * if it sees block_group->removed == 1 (needs to lock block_group->lock
- * before checking block_group->removed).
- */
- if (!remove_em) {
- /*
- * Our em might be in trans->transaction->pending_chunks which
- * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
- * and so is the fs_info->pinned_chunks list.
- *
- * So at this point we must be holding the chunk_mutex to avoid
- * any races with chunk allocation (more specifically at
- * volumes.c:contains_pending_extent()), to ensure it always
- * sees the em, either in the pending_chunks list or in the
- * pinned_chunks list.
- */
- list_move_tail(&em->list, &fs_info->pinned_chunks);
- }
- spin_unlock(&block_group->lock);
-
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree.map_tree;
- write_lock(&em_tree->lock);
- /*
- * The em might be in the pending_chunks list, so make sure the
- * chunk mutex is locked, since remove_extent_mapping() will
- * delete us from that list.
- */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
-
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
-
- btrfs_put_block_group(block_group);
- btrfs_put_block_group(block_group);
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- ret = btrfs_del_item(trans, root, path);
-out:
- if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
- btrfs_free_path(path);
- return ret;
-}
-
-struct btrfs_trans_handle *
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
- const u64 chunk_offset)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
- struct extent_map *em;
- struct map_lookup *map;
- unsigned int num_items;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
-
- /*
- * We need to reserve 3 + N units from the metadata space info in order
- * to remove a block group (done at btrfs_remove_chunk() and at
- * btrfs_remove_block_group()), which are used for:
- *
- * 1 unit for adding the free space inode's orphan (located in the tree
- * of tree roots).
- * 1 unit for deleting the block group item (located in the extent
- * tree).
- * 1 unit for deleting the free space item (located in tree of tree
- * roots).
- * N units for deleting N device extent items corresponding to each
- * stripe (located in the device tree).
- *
- * In order to remove a block group we also need to reserve units in the
- * system space info in order to update the chunk tree (update one or
- * more device items and remove one chunk item), but this is done at
- * btrfs_remove_chunk() through a call to check_system_chunk().
- */
- map = em->map_lookup;
- num_items = 3 + map->num_stripes;
- free_extent_map(em);
-
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items, 1);
-}
-
-/*
- * Process the unused_bgs list and remove any that don't have any allocated
- * space inside of them.
- */
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_space_info *space_info;
- struct btrfs_trans_handle *trans;
- int ret = 0;
-
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
- return;
-
- spin_lock(&fs_info->unused_bgs_lock);
- while (!list_empty(&fs_info->unused_bgs)) {
- u64 start, end;
- int trimming;
-
- block_group = list_first_entry(&fs_info->unused_bgs,
- struct btrfs_block_group_cache,
- bg_list);
- list_del_init(&block_group->bg_list);
-
- space_info = block_group->space_info;
-
- if (ret || btrfs_mixed_space_info(space_info)) {
- btrfs_put_block_group(block_group);
- continue;
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
- /* Don't want to race with allocators so take the groups_sem */
- down_write(&space_info->groups_sem);
- spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- btrfs_block_group_used(&block_group->item) ||
- block_group->ro ||
- list_is_singular(&block_group->list)) {
- /*
- * We want to bail if we made new allocations or have
- * outstanding allocations in this block group. We do
- * the ro check in case balance is currently acting on
- * this block group.
- */
- trace_btrfs_skip_unused_block_group(block_group);
- spin_unlock(&block_group->lock);
- up_write(&space_info->groups_sem);
- goto next;
- }
- spin_unlock(&block_group->lock);
-
- /* We don't want to force the issue, only flip if it's ok. */
- ret = inc_block_group_ro(block_group, 0);
- up_write(&space_info->groups_sem);
- if (ret < 0) {
- ret = 0;
- goto next;
- }
-
- /*
- * Want to do this before we do anything else so we can recover
- * properly if we fail to join the transaction.
- */
- trans = btrfs_start_trans_remove_block_group(fs_info,
- block_group->key.objectid);
- if (IS_ERR(trans)) {
- btrfs_dec_block_group_ro(block_group);
- ret = PTR_ERR(trans);
- goto next;
- }
-
- /*
- * We could have pending pinned extents for this block group,
- * just delete them, we don't care about them anymore.
- */
- start = block_group->key.objectid;
- end = start + block_group->key.offset - 1;
- /*
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
- * btrfs_finish_extent_commit(). If we are at transaction N,
- * another task might be running finish_extent_commit() for the
- * previous transaction N - 1, and have seen a range belonging
- * to the block group in freed_extents[] before we were able to
- * clear the whole block group range from freed_extents[]. This
- * means that task can lookup for the block group after we
- * unpinned it from freed_extents[] and removed it, leading to
- * a BUG_ON() at btrfs_unpin_extent_range().
- */
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-
- /* Reset pinned so btrfs_put_block_group doesn't complain */
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
-
- update_bytes_pinned(space_info, -block_group->pinned);
- space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- block_group->pinned = 0;
-
- spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
-
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD);
-
- /* Implicit trim during transaction commit. */
- if (trimming)
- btrfs_get_block_group_trimming(block_group);
-
- /*
- * Btrfs_remove_chunk will abort the transaction if things go
- * horribly wrong.
- */
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
-
- if (ret) {
- if (trimming)
- btrfs_put_block_group_trimming(block_group);
- goto end_trans;
- }
-
- /*
- * If we're not mounted with -odiscard, we can just forget
- * about this block group. Otherwise we'll need to wait
- * until transaction commit to do the actual discard.
- */
- if (trimming) {
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * A concurrent scrub might have added us to the list
- * fs_info->unused_bgs, so use a list_move operation
- * to add the block group to the deleted_bgs list.
- */
- list_move(&block_group->bg_list,
- &trans->transaction->deleted_bgs);
- spin_unlock(&fs_info->unused_bgs_lock);
- btrfs_get_block_group(block_group);
- }
-end_trans:
- btrfs_end_transaction(trans);
-next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- btrfs_put_block_group(block_group);
- spin_lock(&fs_info->unused_bgs_lock);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-}
-
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
- u64 flags;
- int mixed = 0;
- int ret;
-
- disk_super = fs_info->super_copy;
- if (!btrfs_super_root(disk_super))
- return -EINVAL;
-
- features = btrfs_super_incompat_flags(disk_super);
- if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
- mixed = 1;
-
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = create_space_info(fs_info, flags);
- if (ret)
- goto out;
-
- if (mixed) {
- flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags);
- } else {
- flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = create_space_info(fs_info, flags);
- if (ret)
- goto out;
-
- flags = BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags);
- }
-out:
- return ret;
-}
-
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
@@ -11314,10 +5570,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* it while performing the free space search since we have already
* held back allocations.
*/
-static int btrfs_trim_free_extents(struct btrfs_device *device,
- u64 minlen, u64 *trimmed)
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = 0, len = 0;
+ u64 start = SZ_1M, len = 0, end = 0;
int ret;
*trimmed = 0;
@@ -11338,43 +5593,41 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
while (1) {
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_transaction *trans;
u64 bytes;
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
break;
- ret = down_read_killable(&fs_info->commit_root_sem);
- if (ret) {
- mutex_unlock(&fs_info->chunk_mutex);
- break;
- }
+ find_first_clear_extent_bit(&device->alloc_state, start,
+ &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
- spin_lock(&fs_info->trans_lock);
- trans = fs_info->running_transaction;
- if (trans)
- refcount_inc(&trans->use_count);
- spin_unlock(&fs_info->trans_lock);
+ /* Ensure we skip the reserved area in the first 1M */
+ start = max_t(u64, start, SZ_1M);
- if (!trans)
- up_read(&fs_info->commit_root_sem);
+ /*
+ * If find_first_clear_extent_bit find a range that spans the
+ * end of the device it will set end to -1, in this case it's up
+ * to the caller to trim the value to the size of the device.
+ */
+ end = min(end, device->total_bytes - 1);
- ret = find_free_dev_extent_start(trans, device, minlen, start,
- &start, &len);
- if (trans) {
- up_read(&fs_info->commit_root_sem);
- btrfs_put_transaction(trans);
- }
+ len = end - start + 1;
- if (ret) {
+ /* We didn't find any extents */
+ if (!len) {
mutex_unlock(&fs_info->chunk_mutex);
- if (ret == -ENOSPC)
- ret = 0;
+ ret = 0;
break;
}
- ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ ret = btrfs_issue_discard(device->bdev, start, len,
+ &bytes);
+ if (!ret)
+ set_extent_bits(&device->alloc_state, start,
+ start + bytes - 1,
+ CHUNK_TRIMMED);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -11405,10 +5658,11 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
struct list_head *devices;
u64 group_trimmed;
+ u64 range_end = U64_MAX;
u64 start;
u64 end;
u64 trimmed = 0;
@@ -11418,26 +5672,33 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
int dev_ret = 0;
int ret = 0;
+ /*
+ * Check range overflow if range->len is set.
+ * The default range->len is U64_MAX.
+ */
+ if (range->len != U64_MAX &&
+ check_add_overflow(range->start, range->len, &range_end))
+ return -EINVAL;
+
cache = btrfs_lookup_first_block_group(fs_info, range->start);
- for (; cache; cache = next_block_group(fs_info, cache)) {
- if (cache->key.objectid >= (range->start + range->len)) {
+ for (; cache; cache = btrfs_next_block_group(cache)) {
+ if (cache->start >= range_end) {
btrfs_put_block_group(cache);
break;
}
- start = max(range->start, cache->key.objectid);
- end = min(range->start + range->len,
- cache->key.objectid + cache->key.offset);
+ start = max(range->start, cache->start);
+ end = min(range_end, cache->start + cache->length);
if (end - start >= range->minlen) {
- if (!block_group_cache_done(cache)) {
- ret = cache_block_group(cache, 0);
+ if (!btrfs_block_group_done(cache)) {
+ ret = btrfs_cache_block_group(cache, 0);
if (ret) {
bg_failed++;
bg_ret = ret;
continue;
}
- ret = wait_block_group_cache_done(cache);
+ ret = btrfs_wait_block_group_cache_done(cache);
if (ret) {
bg_failed++;
bg_ret = ret;
@@ -11466,8 +5727,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
- ret = btrfs_trim_free_extents(device, range->minlen,
- &group_trimmed);
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
dev_ret = ret;
@@ -11531,16 +5791,3 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
!atomic_read(&root->will_be_snapshotted));
}
}
-
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
-{
- struct btrfs_fs_info *fs_info = bg->fs_info;
-
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- trace_btrfs_add_unused_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
-}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ca8b8e785cf3..2f4802f405a2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
@@ -59,12 +60,23 @@ void btrfs_leak_debug_del(struct list_head *entry)
spin_unlock_irqrestore(&leak_lock, flags);
}
-static inline
-void btrfs_leak_debug_check(void)
+static inline void btrfs_extent_buffer_leak_debug_check(void)
{
- struct extent_state *state;
struct extent_buffer *eb;
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
@@ -74,14 +86,6 @@ void btrfs_leak_debug_check(void)
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
}
#define btrfs_debug_check_extent_io_range(tree, start, end) \
@@ -105,12 +109,11 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
-#define BUFFER_LRU_MAX 64
-
struct tree_entry {
u64 start;
u64 end;
@@ -151,49 +154,70 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- mp_bvec_last_segment(bvec, &bv);
- start = page_offset(bv.bv_page) + bv.bv_offset;
bio->bi_private = NULL;
if (tree->ops)
ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
+ mirror_num, bio_flags);
else
btrfsic_submit_bio(bio);
return blk_status_to_errno(ret);
}
-static void flush_write_bio(struct extent_page_data *epd)
+/* Cleanup unsubmitted bios */
+static void end_write_bio(struct extent_page_data *epd, int ret)
{
if (epd->bio) {
- int ret;
+ epd->bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(epd->bio);
+ epd->bio = NULL;
+ }
+}
+
+/*
+ * Submit bio from extent page data via submit_one_bio
+ *
+ * Return 0 if everything is OK.
+ * Return <0 for error.
+ */
+static int __must_check flush_write_bio(struct extent_page_data *epd)
+{
+ int ret = 0;
+ if (epd->bio) {
ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
+ /*
+ * Clean up of epd->bio is handled by its endio function.
+ * And endio is either triggered by successful bio execution
+ * or the error handler of submit bio hook.
+ * So at this point, no matter what happened, we don't need
+ * to clean up epd->bio.
+ */
epd->bio = NULL;
}
+ return ret;
}
-int __init extent_io_init(void)
+int __init extent_state_cache_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
+ return 0;
+}
+int __init extent_io_init(void)
+{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
- goto free_state_cache;
+ return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_io_bio, bio),
@@ -211,35 +235,68 @@ free_bioset:
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
extent_buffer_cache = NULL;
+ return -ENOMEM;
+}
-free_state_cache:
+void __cold extent_state_cache_exit(void)
+{
+ btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
- extent_state_cache = NULL;
- return -ENOMEM;
}
void __cold extent_io_exit(void)
{
- btrfs_leak_debug_check();
+ btrfs_extent_buffer_leak_debug_check();
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
bioset_exit(&btrfs_bioset);
}
-void extent_io_tree_init(struct extent_io_tree *tree,
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
void *private_data)
{
+ tree->fs_info = fs_info;
tree->state = RB_ROOT;
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
+ tree->owner = owner;
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
}
static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -312,6 +369,24 @@ do_insert:
return NULL;
}
+/**
+ * __etree_search - searche @tree for an entry that contains @offset. Such
+ * entry would have entry->start <= offset && entry->end >= offset.
+ *
+ * @tree - the tree to search
+ * @offset - offset that should fall within an entry in @tree
+ * @next_ret - pointer to the first entry whose range ends after @offset
+ * @prev - pointer to the first entry whose range begins before @offset
+ * @p_ret - pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret - points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * This function returns a pointer to the entry that contains @offset byte
+ * address. If no such entry exists, then NULL is returned and the other
+ * pointer arguments to the function are filled, otherwise the found entry is
+ * returned and other pointers are left untouched.
+ */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
struct rb_node **next_ret,
struct rb_node **prev_ret,
@@ -400,7 +475,7 @@ static void merge_state(struct extent_io_tree *tree,
struct extent_state *other;
struct rb_node *other_node;
- if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
return;
other_node = rb_prev(&state->rb_node);
@@ -457,9 +532,11 @@ static int insert_state(struct extent_io_tree *tree,
{
struct rb_node *node;
- if (end < start)
- WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
- end, start);
+ if (end < start) {
+ btrfs_err(tree->fs_info,
+ "insert state: end < start %llu %llu", end, start);
+ WARN_ON(1);
+ }
state->start = start;
state->end = end;
@@ -469,7 +546,8 @@ static int insert_state(struct extent_io_tree *tree,
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
- pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
found->start, found->end, start, end);
return -EEXIST;
}
@@ -611,6 +689,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int clear = 0;
btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
@@ -618,7 +697,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
- if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
@@ -850,7 +929,7 @@ static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
return cache_state_if_flags(state, cached_ptr,
- EXTENT_IOBITS | EXTENT_BOUNDARY);
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
}
/*
@@ -880,6 +959,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_end;
btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
@@ -1112,6 +1192,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
again:
if (!prealloc) {
@@ -1311,6 +1393,13 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
changeset);
}
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+ GFP_NOWAIT, NULL);
+}
+
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached)
@@ -1478,15 +1567,128 @@ out:
return ret;
}
+/**
+ * find_first_clear_extent_bit - find the first range that has @bits not set.
+ * This range could start before @start.
+ *
+ * @tree - the tree to search
+ * @start - the offset at/after which the found extent should start
+ * @start_ret - records the beginning of the range
+ * @end_ret - records the end of the range (inclusive)
+ * @bits - the set of bits which must be unset
+ *
+ * Since unallocated range is also considered one which doesn't have the bits
+ * set it's possible that @end_ret contains -1, this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+ struct extent_state *state;
+ struct rb_node *node, *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ node = __etree_search(tree, start, &next, &prev, NULL, NULL);
+ if (!node) {
+ node = next;
+ if (!node) {
+ /*
+ * We are past the last allocated chunk,
+ * set start at the end of the last extent. The
+ * device alloc tree should never be empty so
+ * prev is always set.
+ */
+ ASSERT(prev);
+ state = rb_entry(prev, struct extent_state, rb_node);
+ *start_ret = state->end + 1;
+ *end_ret = -1;
+ goto out;
+ }
+ }
+ /*
+ * At this point 'node' either contains 'start' or start is
+ * before 'node'
+ */
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as
+ * the beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev) {
+ state = rb_entry(prev, struct extent_state,
+ rb_node);
+ *start_ret = state->end + 1;
+ } else {
+ *start_ret = 0;
+ }
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
/*
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
* true is returned if we find something, false if nothing was in the tree
*/
-static noinline bool find_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
@@ -1588,10 +1790,10 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end)
{
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@@ -1604,8 +1806,8 @@ again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
- found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
- max_bytes, &cached_state);
+ found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
if (!found || delalloc_end <= *start) {
*start = delalloc_start;
*end = delalloc_end;
@@ -1707,7 +1909,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
- if (pages[i] == locked_page) {
+ if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
pages_locked++;
continue;
@@ -1746,9 +1948,9 @@ out:
}
void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
- u64 delalloc_end, struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+ struct page *locked_page,
+ unsigned clear_bits,
+ unsigned long page_ops)
{
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
NULL);
@@ -1822,8 +2024,8 @@ out:
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -1850,8 +2052,8 @@ out:
return ret;
}
-static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+int get_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record **failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -2061,9 +2263,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
return 0;
}
-int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int mirror_num)
+int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
int i, num_pages = num_extent_pages(eb);
int ret = 0;
@@ -2342,7 +2544,6 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
bio = btrfs_io_bio_alloc(1);
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
bio->bi_iter.bi_size = 0;
bio->bi_private = data;
@@ -2409,7 +2610,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
read_mode, failrec->this_mirror, failrec->in_validation);
status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- failrec->bio_flags, 0);
+ failrec->bio_flags);
if (status) {
free_io_failure(failure_tree, tree, failrec);
bio_put(bio);
@@ -2451,11 +2652,10 @@ static void end_bio_extent_writepage(struct bio *bio)
struct bio_vec *bvec;
u64 start;
u64 end;
- int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2523,11 +2723,10 @@ static void end_bio_extent_readpage(struct bio *bio)
u64 extent_len = 0;
int mirror;
int ret;
- int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2607,8 +2806,6 @@ static void end_bio_extent_readpage(struct bio *bio)
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
&eb->bflags))
btree_readahead_hook(eb, -EIO);
-
- ret = -EIO;
}
readpage_ok:
if (likely(uptodate)) {
@@ -2673,12 +2870,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
* never fail. We're returning a bio right now but you can call btrfs_io_bio
* for the appropriate container_of magic
*/
-struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+struct bio *btrfs_bio_alloc(u64 first_byte)
{
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
@@ -2733,7 +2929,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @offset: starting offset in the page
- * @bdev: attach newly created bios to this bdev
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
@@ -2744,7 +2939,6 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
- struct block_device *bdev,
struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
@@ -2784,20 +2978,24 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
bio = NULL;
} else {
if (wbc)
- wbc_account_io(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, page_size);
return 0;
}
}
- bio = btrfs_bio_alloc(bdev, offset);
+ bio = btrfs_bio_alloc(offset);
bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
bio->bi_opf = opf;
if (wbc) {
+ struct block_device *bdev;
+
+ bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, page_size);
}
*bio_ret = bio;
@@ -2877,7 +3075,6 @@ static int __do_readpage(struct extent_io_tree *tree,
u64 block_start;
u64 cur_end;
struct extent_map *em;
- struct block_device *bdev;
int ret = 0;
int nr = 0;
size_t pg_offset = 0;
@@ -2954,7 +3151,6 @@ static int __do_readpage(struct extent_io_tree *tree,
offset = em->block_start + extent_offset;
disk_io_size = iosize;
}
- bdev = em->bdev;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3044,7 +3240,7 @@ static int __do_readpage(struct extent_io_tree *tree,
ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
page, offset, disk_io_size,
- pg_offset, bdev, bio,
+ pg_offset, bio,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag,
@@ -3069,7 +3265,7 @@ out:
return ret;
}
-static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+static inline void contiguous_readpages(struct extent_io_tree *tree,
struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
@@ -3077,21 +3273,10 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
unsigned long *bio_flags,
u64 *prev_em_start)
{
- struct inode *inode;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
- inode = pages[0]->mapping->host;
- while (1) {
- lock_extent(tree, start, end);
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- end - start + 1);
- if (!ordered)
- break;
- unlock_extent(tree, start, end);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- }
+ btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
@@ -3100,46 +3285,6 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
}
}
-static void __extent_readpages(struct extent_io_tree *tree,
- struct page *pages[],
- int nr_pages,
- struct extent_map **em_cached,
- struct bio **bio, unsigned long *bio_flags,
- u64 *prev_em_start)
-{
- u64 start = 0;
- u64 end = 0;
- u64 page_start;
- int index;
- int first_index = 0;
-
- for (index = 0; index < nr_pages; index++) {
- page_start = page_offset(pages[index]);
- if (!end) {
- start = page_start;
- end = start + PAGE_SIZE - 1;
- first_index = index;
- } else if (end + 1 == page_start) {
- end += PAGE_SIZE;
- } else {
- __do_contiguous_readpages(tree, &pages[first_index],
- index - first_index, start,
- end, em_cached,
- bio, bio_flags,
- prev_em_start);
- start = page_start;
- end = start + PAGE_SIZE - 1;
- first_index = index;
- }
- }
-
- if (end)
- __do_contiguous_readpages(tree, &pages[first_index],
- index - first_index, start,
- end, em_cached, bio,
- bio_flags, prev_em_start);
-}
-
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
@@ -3147,22 +3292,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
unsigned long *bio_flags,
unsigned int read_flags)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
int ret;
- while (1) {
- lock_extent(tree, start, end);
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- PAGE_SIZE);
- if (!ordered)
- break;
- unlock_extent(tree, start, end);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- }
+ btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
bio_flags, read_flags, NULL);
@@ -3203,7 +3338,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
bool found;
u64 delalloc_to_write = 0;
@@ -3213,8 +3347,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, tree,
- page,
+ found = find_lock_delalloc_range(inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3223,7 +3356,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
- /* File system has been set read-only */
if (ret) {
SetPageError(page);
/*
@@ -3285,7 +3417,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
struct extent_page_data *epd,
loff_t i_size,
unsigned long nr_written,
- unsigned int write_flags, int *nr_ret)
+ int *nr_ret)
{
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
@@ -3296,11 +3428,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 block_start;
u64 iosize;
struct extent_map *em;
- struct block_device *bdev;
size_t pg_offset = 0;
size_t blocksize;
int ret = 0;
int nr = 0;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
ret = btrfs_writepage_cow_fixup(page, start, page_end);
@@ -3354,7 +3486,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
offset = em->block_start + extent_offset;
- bdev = em->bdev;
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
free_extent_map(em);
@@ -3396,7 +3527,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
page, offset, iosize, pg_offset,
- bdev, &epd->bio,
+ &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3419,6 +3550,9 @@ done:
* records are inserted to lock ranges in the tree, and as dirty areas
* are found, they are marked writeback. Then the lock bits are removed
* and the end_io handler clears the writeback ranges
+ *
+ * Return 0 if everything goes well.
+ * Return <0 for error.
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -3431,11 +3565,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
size_t pg_offset = 0;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned int write_flags = 0;
unsigned long nr_written = 0;
- write_flags = wbc_to_write_flags(wbc);
-
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
@@ -3473,7 +3604,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, write_flags, &nr);
+ i_size, nr_written, &nr);
if (ret == 1)
goto done_unlocked;
@@ -3488,6 +3619,7 @@ done:
end_extent_writepage(page, ret, start, page_end);
}
unlock_page(page);
+ ASSERT(ret <= 0);
return ret;
done_unlocked:
@@ -3500,18 +3632,33 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
-static noinline_for_stack int
-lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct btrfs_fs_info *fs_info,
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+/*
+ * Lock eb pages and flush the bio if we can't the locks
+ *
+ * Return 0 if nothing went wrong
+ * Return >0 is same as 0, except bio is not submitted
+ * Return <0 if something went wrong, no page is locked
+ */
+static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
{
- int i, num_pages;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int i, num_pages, failed_page_nr;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
+ ret = flush_write_bio(epd);
+ if (ret < 0)
+ return ret;
flush = 1;
- flush_write_bio(epd);
btrfs_tree_lock(eb);
}
@@ -3520,7 +3667,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
if (!epd->sync_io)
return 0;
if (!flush) {
- flush_write_bio(epd);
+ ret = flush_write_bio(epd);
+ if (ret < 0)
+ return ret;
flush = 1;
}
while (1) {
@@ -3561,7 +3710,14 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
if (!trylock_page(p)) {
if (!flush) {
- flush_write_bio(epd);
+ int err;
+
+ err = flush_write_bio(epd);
+ if (err < 0) {
+ ret = err;
+ failed_page_nr = i;
+ goto err_unlock;
+ }
flush = 1;
}
lock_page(p);
@@ -3569,24 +3725,45 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
}
return ret;
-}
-
-static void end_extent_buffer_writeback(struct extent_buffer *eb)
-{
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+err_unlock:
+ /* Unlock already locked pages */
+ for (i = 0; i < failed_page_nr; i++)
+ unlock_page(eb->pages[i]);
+ /*
+ * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
+ * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
+ * be made and undo everything done before.
+ */
+ btrfs_tree_lock(eb);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ end_extent_buffer_writeback(eb);
+ spin_unlock(&eb->refs_lock);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
+ fs_info->dirty_metadata_batch);
+ btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ btrfs_tree_unlock(eb);
+ return ret;
}
static void set_btree_ioerr(struct page *page)
{
struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_fs_info *fs_info;
SetPageError(page);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
/*
+ * If we error out, we should add back the dirty_metadata_bytes
+ * to make it consistent.
+ */
+ fs_info = eb->fs_info;
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ eb->len, fs_info->dirty_metadata_batch);
+
+ /*
* If writeback for a btree extent that doesn't belong to a log tree
* failed, increment the counter transaction->eb_write_errors.
* We do this because while the transaction is running and before it's
@@ -3643,11 +3820,11 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
{
struct bio_vec *bvec;
struct extent_buffer *eb;
- int i, done;
+ int done;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
@@ -3672,11 +3849,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
}
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
- struct btrfs_fs_info *fs_info,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
@@ -3701,7 +3877,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
- end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
memzero_extent_buffer(eb, start, end - start);
}
@@ -3711,7 +3887,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset, PAGE_SIZE, 0, bdev,
+ p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3744,7 +3920,6 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
- struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
@@ -3819,13 +3994,17 @@ retry:
continue;
prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ ret = lock_extent_buffer_for_io(eb, &epd);
if (!ret) {
free_extent_buffer(eb);
continue;
+ } else if (ret < 0) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
}
- ret = write_one_eb(eb, fs_info, wbc, &epd);
+ ret = write_one_eb(eb, wbc, &epd);
if (ret) {
done = 1;
free_extent_buffer(eb);
@@ -3852,7 +4031,12 @@ retry:
index = 0;
goto retry;
}
- flush_write_bio(&epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ end_write_bio(&epd, ret);
+ return ret;
+ }
+ ret = flush_write_bio(&epd);
return ret;
}
@@ -3940,7 +4124,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- done_index = page->index;
+ done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -3949,7 +4133,8 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- flush_write_bio(epd);
+ ret = flush_write_bio(epd);
+ BUG_ON(ret < 0);
lock_page(page);
}
@@ -3959,8 +4144,10 @@ retry:
}
if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(page))
- flush_write_bio(epd);
+ if (PageWriteback(page)) {
+ ret = flush_write_bio(epd);
+ BUG_ON(ret < 0);
+ }
wait_on_page_writeback(page);
}
@@ -3971,22 +4158,7 @@ retry:
}
ret = __extent_writepage(page, wbc, epd);
-
- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
- unlock_page(page);
- ret = 0;
- }
if (ret < 0) {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
done = 1;
break;
}
@@ -4029,8 +4201,14 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
};
ret = __extent_writepage(page, wbc, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ end_write_bio(&epd, ret);
+ return ret;
+ }
- flush_write_bio(&epd);
+ ret = flush_write_bio(&epd);
+ ASSERT(ret <= 0);
return ret;
}
@@ -4055,8 +4233,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
+ /* We're called from an async helper function */
+ .punt_to_cgroup = 1,
+ .no_cgroup_owner = 1,
};
+ wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (start <= end) {
page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
@@ -4070,7 +4252,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
start += PAGE_SIZE;
}
- flush_write_bio(&epd);
+ ASSERT(ret <= 0);
+ if (ret == 0)
+ ret = flush_write_bio(&epd);
+ else
+ end_write_bio(&epd, ret);
+
+ wbc_detach_inode(&wbc_writepages);
return ret;
}
@@ -4086,7 +4274,12 @@ int extent_writepages(struct address_space *mapping,
};
ret = extent_write_cache_pages(mapping, wbc, &epd);
- flush_write_bio(&epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ end_write_bio(&epd, ret);
+ return ret;
+ }
+ ret = flush_write_bio(&epd);
return ret;
}
@@ -4102,6 +4295,8 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
u64 prev_em_start = (u64)-1;
while (!list_empty(pages)) {
+ u64 contig_end = 0;
+
for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
struct page *page = lru_to_page(pages);
@@ -4110,14 +4305,22 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
if (add_to_page_cache_lru(page, mapping, page->index,
readahead_gfp_mask(mapping))) {
put_page(page);
- continue;
+ break;
}
pagepool[nr++] = page;
+ contig_end = page_offset(page) + PAGE_SIZE - 1;
}
- __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
- &bio_flags, &prev_em_start);
+ if (nr) {
+ u64 contig_start = page_offset(pagepool[0]);
+
+ ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+
+ contiguous_readpages(tree, pagepool, nr, contig_start,
+ contig_end, &em_cached, &bio, &bio_flags,
+ &prev_em_start);
+ }
}
if (em_cached)
@@ -4147,10 +4350,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING,
- 1, 1, &cached_state);
+ clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
return 0;
}
@@ -4166,10 +4367,9 @@ static int try_release_extent_state(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret = 1;
- if (test_range_bit(tree, start, end,
- EXTENT_IOBITS, 0, NULL))
+ if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
- else {
+ } else {
/*
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
@@ -4222,8 +4422,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
- EXTENT_LOCKED | EXTENT_WRITEBACK,
- 0, NULL)) {
+ EXTENT_LOCKED, 0, NULL)) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
@@ -4372,8 +4571,7 @@ try_submit_last:
* In this case, the first extent range will be cached but not emitted.
* So we must emit it before ending extent_fiemap().
*/
-static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
- struct fiemap_extent_info *fieinfo,
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache)
{
int ret;
@@ -4407,6 +4605,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct fiemap_cache cache = { 0 };
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4420,6 +4620,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!roots || !tmp_ulist) {
+ ret = -ENOMEM;
+ goto out_free_ulist;
+ }
+
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
@@ -4430,8 +4637,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), -1, 0);
if (ret < 0) {
- btrfs_free_path(path);
- return ret;
+ goto out_free_ulist;
} else {
WARN_ON(!ret);
if (ret == 1)
@@ -4540,7 +4746,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
ret = btrfs_check_shared(root,
btrfs_ino(BTRFS_I(inode)),
- bytenr);
+ bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
if (ret)
@@ -4580,12 +4786,16 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
out_free:
if (!ret)
- ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- btrfs_free_path(path);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
+
+out_free_ulist:
+ btrfs_free_path(path);
+ ulist_free(roots);
+ ulist_free(tmp_ulist);
return ret;
}
@@ -4672,13 +4882,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->fs_info = fs_info;
eb->bflags = 0;
rwlock_init(&eb->lock);
- atomic_set(&eb->write_locks, 0);
- atomic_set(&eb->read_locks, 0);
atomic_set(&eb->blocking_readers, 0);
- atomic_set(&eb->blocking_writers, 0);
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->spinning_writers, 0);
- eb->lock_nested = 0;
+ eb->blocking_writers = 0;
+ eb->lock_nested = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -4695,6 +4901,13 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
> MAX_INLINE_EXTENT_BUFFER_SIZE);
BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
+#ifdef CONFIG_BTRFS_DEBUG
+ eb->spinning_writers = 0;
+ atomic_set(&eb->spinning_readers, 0);
+ atomic_set(&eb->read_locks, 0);
+ eb->write_locks = 0;
+#endif
+
return eb;
}
@@ -4861,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
@@ -5183,8 +5398,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
-int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb, int wait, int mirror_num)
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
int i;
struct page *page;
@@ -5196,6 +5410,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
+ struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -5746,13 +5961,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
btrfs_err(fs_info,
"memmove bogus src_offset %lu move len %lu dst len %lu",
src_offset, len, dst->len);
- BUG_ON(1);
+ BUG();
}
if (dst_offset + len > dst->len) {
btrfs_err(fs_info,
"memmove bogus dst_offset %lu move len %lu dst len %lu",
dst_offset, len, dst->len);
- BUG_ON(1);
+ BUG();
}
while (len > 0) {
@@ -5793,13 +6008,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
btrfs_err(fs_info,
"memmove bogus src_offset %lu move len %lu len %lu",
src_offset, len, dst->len);
- BUG_ON(1);
+ BUG();
}
if (dst_offset + len > dst->len) {
btrfs_err(fs_info,
"memmove bogus dst_offset %lu move len %lu len %lu",
dst_offset, len, dst->len);
- BUG_ON(1);
+ BUG();
}
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 08749e0b9c32..a8551a1f56e2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,28 +7,6 @@
#include <linux/refcount.h>
#include "ulist.h"
-/* bits for the extent state */
-#define EXTENT_DIRTY (1U << 0)
-#define EXTENT_WRITEBACK (1U << 1)
-#define EXTENT_UPTODATE (1U << 2)
-#define EXTENT_LOCKED (1U << 3)
-#define EXTENT_NEW (1U << 4)
-#define EXTENT_DELALLOC (1U << 5)
-#define EXTENT_DEFRAG (1U << 6)
-#define EXTENT_BOUNDARY (1U << 9)
-#define EXTENT_NODATASUM (1U << 10)
-#define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_NEED_WAIT (1U << 12)
-#define EXTENT_DAMAGED (1U << 13)
-#define EXTENT_NORESERVE (1U << 14)
-#define EXTENT_QGROUP_RESERVED (1U << 15)
-#define EXTENT_CLEAR_DATA_RESV (1U << 16)
-#define EXTENT_DELALLOC_NEW (1U << 17)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
- EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
-
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
@@ -82,15 +60,11 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct extent_state;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
-
-typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset);
+struct extent_io_tree;
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
@@ -100,37 +74,13 @@ struct extent_io_ops {
* The following callbacks must be always defined, the function
* pointer will be called unconditionally.
*/
- extent_submit_bio_hook_t *submit_bio_hook;
+ blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int mirror);
};
-struct extent_io_tree {
- struct rb_root state;
- void *private_data;
- u64 dirty_bytes;
- int track_uptodate;
- spinlock_t lock;
- const struct extent_io_ops *ops;
-};
-
-struct extent_state {
- u64 start;
- u64 end; /* inclusive */
- struct rb_node rb_node;
-
- /* ADD NEW ELEMENTS AFTER THIS */
- wait_queue_head_t wq;
- refcount_t refs;
- unsigned state;
-
- struct io_failure_record *failrec;
-
-#ifdef CONFIG_BTRFS_DEBUG
- struct list_head leak_list;
-#endif
-};
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
@@ -146,14 +96,9 @@ struct extent_buffer {
struct rcu_head rcu_head;
pid_t lock_owner;
- /* count of read lock holders on the extent buffer */
- atomic_t write_locks;
- atomic_t read_locks;
- atomic_t blocking_writers;
+ int blocking_writers;
atomic_t blocking_readers;
- atomic_t spinning_readers;
- atomic_t spinning_writers;
- short lock_nested;
+ bool lock_nested;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -171,6 +116,10 @@ struct extent_buffer {
wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
+ int spinning_writers;
+ atomic_t spinning_readers;
+ atomic_t read_locks;
+ int write_locks;
struct list_head leak_list;
#endif
};
@@ -239,145 +188,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
u64 start, u64 len,
int create);
-void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
-
-void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
-
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
-}
-
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
-}
-
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
-}
-
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
-}
-
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
-
-static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
-}
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
-}
-
-static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
-}
-
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state);
-
-static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
- struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
-}
-
-static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
-
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
- struct extent_state **cached_state);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -405,8 +220,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb, int wait,
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
@@ -416,11 +230,6 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
(eb->start >> PAGE_SHIFT);
}
-static inline void extent_buffer_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->refs);
-}
-
static inline int extent_buffer_uptodate(struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -468,10 +277,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb,
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
- u64 delalloc_end, struct page *locked_page,
- unsigned bits_to_clear,
- unsigned long page_ops);
-struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
+ struct page *locked_page,
+ unsigned bits_to_clear,
+ unsigned long page_ops);
+struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
@@ -482,13 +291,8 @@ struct btrfs_inode;
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int mirror_num);
+int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@@ -510,25 +314,17 @@ struct io_failure_record {
};
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
struct page *page, int pg_offset, int icsum,
bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree,
+bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
-
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 928f729c55ba..6f417ff68980 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "ctree.h"
+#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
@@ -213,9 +214,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
prev->block_start != EXTENT_MAP_DELALLOC);
+ if (prev->map_lookup || next->map_lookup)
+ ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
+ test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
- prev->bdev == next->bdev &&
+ prev->map_lookup == next->map_lookup &&
((next->block_start == EXTENT_MAP_HOLE &&
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
@@ -337,6 +342,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
try_merge_map(tree, em);
}
+static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
+{
+ struct map_lookup *map = em->map_lookup;
+ u64 stripe_size = em->orig_block_len;
+ int i;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bits_nowait(&device->alloc_state, stripe->physical,
+ stripe->physical + stripe_size - 1, bits);
+ }
+}
+
+static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
+{
+ struct map_lookup *map = em->map_lookup;
+ u64 stripe_size = em->orig_block_len;
+ int i;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + stripe_size - 1, bits,
+ 0, 0, NULL, GFP_NOWAIT, NULL);
+ }
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -352,11 +388,17 @@ int add_extent_mapping(struct extent_map_tree *tree,
{
int ret = 0;
+ lockdep_assert_held_write(&tree->lock);
+
ret = tree_insert(&tree->map, em);
if (ret)
goto out;
setup_extent_mapping(tree, em, modified);
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
+ extent_map_device_set_bits(em, CHUNK_ALLOCATED);
+ extent_map_device_clear_bits(em, CHUNK_TRIMMED);
+ }
out:
return ret;
}
@@ -438,6 +480,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 473f039fcd7c..8e217337dff9 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -42,15 +42,8 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- union {
- struct block_device *bdev;
-
- /*
- * used for chunk mappings
- * flags & EXTENT_FLAG_FS_MAPPING must be set
- */
- struct map_lookup *map_lookup;
- };
+ /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
+ struct map_lookup *map_lookup;
refcount_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cccc75d15970..b1bfdc5c1387 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -8,6 +8,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -22,9 +23,13 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
-#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \
- sizeof(struct btrfs_ordered_sum)) / \
- sizeof(u32) * (fs_info)->sectorsize)
+static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
+ u16 csum_size)
+{
+ u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
+
+ return ncsums * fs_info->sectorsize;
+}
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 logical_offset, u32 *dst, int dio)
+ u64 logical_offset, u8 *dst, int dio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio_vec bvec;
@@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
}
csum = btrfs_bio->csum;
} else {
- csum = (u8 *)dst;
+ csum = dst;
}
if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
@@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
if (!dio)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- (u32 *)csum, nblocks);
+ csum, nblocks);
if (count)
goto found;
@@ -283,7 +288,8 @@ next:
return 0;
}
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+ u8 *dst)
{
return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
@@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(fs_info));
+ max_ordered_sum_bytes(fs_info, csum_size));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
@@ -413,10 +419,21 @@ fail:
return ret;
}
+/*
+ * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+ * @inode: Owner of the data inside the bio
+ * @bio: Contains the data to be checksummed
+ * @file_start: offset in file this bio begins to describe
+ * @contig: Boolean. If true/1 means all bio vecs in this bio are
+ * contiguous and they begin at @file_start in the file. False/0
+ * means this bio can contains potentially discontigous bio vecs
+ * so the logical offset of each should be calculated separately.
+ */
blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
char *data;
@@ -429,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
int i;
u64 offset;
unsigned nofs_flag;
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@@ -449,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
+ shash->tfm = fs_info->csum_shash;
+
bio_for_each_segment(bvec, bio, iter) {
if (!contig)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
@@ -458,8 +478,6 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
BUG_ON(!ordered); /* Logic error */
}
- data = kmap_atomic(bvec.bv_page);
-
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
@@ -469,10 +487,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
offset < ordered->file_offset) {
unsigned long bytes_left;
- kunmap_atomic(data);
sums->len = this_sum_bytes;
this_sum_bytes = 0;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_add_ordered_sum(ordered, sums);
btrfs_put_ordered_extent(ordered);
bytes_left = bio->bi_iter.bi_size - total_bytes;
@@ -489,28 +506,24 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
+ total_bytes;
index = 0;
-
- data = kmap_atomic(bvec.bv_page);
}
- sums->sums[index] = ~(u32)0;
- sums->sums[index]
- = btrfs_csum_data(data + bvec.bv_offset
- + (i * fs_info->sectorsize),
- sums->sums[index],
- fs_info->sectorsize);
- btrfs_csum_final(sums->sums[index],
- (char *)(sums->sums + index));
- index++;
+ crypto_shash_init(shash);
+ data = kmap_atomic(bvec.bv_page);
+ crypto_shash_update(shash, data + bvec.bv_offset
+ + (i * fs_info->sectorsize),
+ fs_info->sectorsize);
+ kunmap_atomic(data);
+ crypto_shash_final(shash, (char *)(sums->sums + index));
+ index += csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
}
- kunmap_atomic(data);
}
this_sum_bytes = 0;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_add_ordered_sum(ordered, sums);
btrfs_put_ordered_extent(ordered);
return 0;
}
@@ -551,7 +564,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(fs_info, path, new_size, 1);
+ btrfs_truncate_item(path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -563,7 +576,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(fs_info, path, new_size, 0);
+ btrfs_truncate_item(path, new_size, 0);
key->offset = end_byte;
btrfs_set_item_key_safe(fs_info, path, key);
@@ -577,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
* range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len)
+ struct btrfs_root *root, u64 bytenr, u64 len)
{
- struct btrfs_root *root = fs_info->csum_root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
u64 end_byte = bytenr + len;
@@ -589,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ ASSERT(root == fs_info->csum_root ||
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -832,11 +848,11 @@ again:
u32 diff;
u32 free_space;
- if (btrfs_leaf_free_space(fs_info, leaf) <
+ if (btrfs_leaf_free_space(leaf) <
sizeof(struct btrfs_item) + csum_size * 2)
goto insert;
- free_space = btrfs_leaf_free_space(fs_info, leaf) -
+ free_space = btrfs_leaf_free_space(leaf) -
sizeof(struct btrfs_item) - csum_size;
tmp = sums->len - total_bytes;
tmp >>= fs_info->sb->s_blocksize_bits;
@@ -852,7 +868,7 @@ again:
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(fs_info, path, diff);
+ btrfs_extend_item(path, diff);
ret = 0;
goto csum;
}
@@ -898,9 +914,9 @@ found:
write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
ins_size);
+ index += ins_size;
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
@@ -932,7 +948,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
- em->bdev = fs_info->fs_devices->latest_bdev;
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 34fe8a58b0e9..8d47c76b7bd1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -26,6 +26,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
+#include "delalloc-space.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -295,7 +296,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.objectid = defrag->ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
@@ -536,8 +537,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* we can set things up properly
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, cached);
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
@@ -558,7 +559,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
}
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- extra_bits, cached, 0);
+ extra_bits, cached);
if (err)
return err;
@@ -666,7 +667,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
}
split->generation = gen;
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
@@ -679,7 +679,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
split->start = start + len;
split->len = em->start + em->len - (start + len);
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
@@ -754,6 +753,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(BTRFS_I(inode));
@@ -909,11 +909,14 @@ next_slot:
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- disk_bytenr, num_bytes, 0,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
start - extent_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -993,11 +996,14 @@ delete_extent_item:
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- ret = btrfs_free_extent(trans, root,
- disk_bytenr, num_bytes, 0,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_DROP_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key.objectid, key.offset -
- extent_offset);
+ key.objectid,
+ key.offset - extent_offset);
+ ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1025,7 +1031,7 @@ delete_extent_item:
continue;
}
- BUG_ON(1);
+ BUG();
}
if (!ret && del_nr > 0) {
@@ -1050,7 +1056,7 @@ delete_extent_item:
if (!ret && replace_extent && leafs_visited == 1 &&
(path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
path->locks[0] == BTRFS_WRITE_LOCK) &&
- btrfs_leaf_free_space(fs_info, leaf) >=
+ btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + extent_item_size) {
key.objectid = ino;
@@ -1142,6 +1148,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 bytenr;
@@ -1287,9 +1294,11 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(leaf);
- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
+ orig_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1311,6 +1320,9 @@ again:
other_start = end;
other_end = 0;
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1321,9 +1333,7 @@ again:
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1341,9 +1351,7 @@ again:
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1541,30 +1549,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
- struct btrfs_ordered_extent *ordered;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
ret = btrfs_start_write_no_snapshotting(root);
if (!ret)
- return -ENOSPC;
+ return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- while (1) {
- lock_extent(&inode->io_tree, lockstart, lockend);
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (!ordered) {
- break;
- }
- unlock_extent(&inode->io_tree, lockstart, lockend);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- }
+ btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
+ lockend, NULL);
num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
@@ -1591,7 +1589,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
- struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
@@ -1611,6 +1608,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
@@ -1636,6 +1634,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
@@ -1692,7 +1691,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
break;
}
@@ -1704,7 +1703,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
ret = extents_locked;
break;
}
@@ -1758,11 +1757,21 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
- true);
+ else
+ free_extent_state(cached_state);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -1781,7 +1790,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -1882,10 +1890,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
u64 start_pos;
u64 end_pos;
ssize_t num_written = 0;
- bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ const bool sync = iocb->ki_flags & IOCB_DSYNC;
ssize_t err;
loff_t pos;
- size_t count = iov_iter_count(from);
+ size_t count;
loff_t oldsize;
int clean_page = 0;
@@ -1893,9 +1901,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
(iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
return -EAGAIN;
+ } else {
inode_lock(inode);
}
@@ -1906,6 +1915,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
}
pos = iocb->ki_pos;
+ count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
/*
* We will allocate space in case nodatacow is not set,
@@ -2056,13 +2066,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
- u64 len;
- /*
- * The range length can be represented by u64, we have to do the typecasts
- * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
- */
- len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
@@ -2089,6 +2093,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
+ * If the inode needs a full sync, make sure we use a full range to
+ * avoid log tree corruption, due to hole detection racing with ordered
+ * extent completion for adjacent ranges, and assertion failures during
+ * hole detection. Do this while holding the inode lock, to avoid races
+ * with other tasks.
+ */
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
@@ -2115,8 +2132,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
+ *
+ * Also, the range length can be represented by u64, we have to do the
+ * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
*/
- ret = btrfs_wait_ordered_range(inode, start, len);
+ ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
if (ret) {
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
@@ -2165,7 +2185,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
inode_unlock(inode);
goto out;
}
- trans->sync = true;
ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
if (ret < 0) {
@@ -2339,7 +2358,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -2428,27 +2446,286 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
+static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_clone_extent_info *clone_info,
+ const u64 clone_len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int slot;
+ struct btrfs_ref ref = { 0 };
+ u64 ref_offset;
+ int ret;
+
+ if (clone_len == 0)
+ return 0;
+
+ if (clone_info->disk_offset == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_info->file_offset;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ clone_info->item_size);
+ if (ret)
+ return ret;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, clone_info->extent_buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ clone_info->item_size);
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /* If it's a hole, nothing more needs to be done. */
+ if (clone_info->disk_offset == 0)
+ return 0;
+
+ inode_add_bytes(inode, clone_len);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ clone_info->disk_offset,
+ clone_info->disk_len, 0);
+ ref_offset = clone_info->file_offset - clone_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+
+ return ret;
+}
+
+/*
+ * The respective range must have been previously locked, as well as the inode.
+ * The end offset is inclusive (last byte of the range).
+ * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
+ * cloning.
+ * When cloning, we don't want to end up in a state where we dropped extents
+ * without inserting a new one, so we must abort the transaction to avoid a
+ * corruption.
+ */
+int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+ const u64 start, const u64 end,
+ struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_block_rsv *rsv;
+ unsigned int rsv_count;
+ u64 cur_offset;
+ u64 drop_end;
+ u64 len = end - start;
+ int ret = 0;
+
+ if (end <= start)
+ return -EINVAL;
+
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent if no_holes isn't set or if we are cloning
+ * an extent
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ rsv_count = 3;
+ else
+ rsv_count = 2;
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ min_size, false);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ cur_offset = start;
+ while (cur_offset < end) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, end + 1, &drop_end,
+ 1, 0, 0, NULL);
+ if (ret != -ENOSPC) {
+ /*
+ * When cloning we want to avoid transaction aborts when
+ * nothing was done and we are attempting to clone parts
+ * of inline extents, in such cases -EOPNOTSUPP is
+ * returned by __btrfs_drop_extents() without having
+ * changed anything in the file.
+ */
+ if (clone_info && ret && ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+
+ if (!clone_info && cur_offset < drop_end &&
+ cur_offset < ino_size) {
+ ret = fill_holes(trans, BTRFS_I(inode), path,
+ cur_offset, drop_end);
+ if (ret) {
+ /*
+ * If we failed then we didn't insert our hole
+ * entries for the area we dropped, so now the
+ * fs is corrupted, so we must abort the
+ * transaction.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (clone_info && drop_end > clone_info->file_offset) {
+ u64 clone_len = drop_end - clone_info->file_offset;
+
+ ret = btrfs_insert_clone_extent(trans, inode, path,
+ clone_info, clone_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ clone_info->data_len -= clone_len;
+ clone_info->data_offset += clone_len;
+ clone_info->file_offset += clone_len;
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ break;
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+ rsv, min_size, false);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+
+ if (!clone_info) {
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we were cloning, force the next fsync to be a full one since we
+ * we replaced (or just dropped in the case of cloning holes when
+ * NO_HOLES is enabled) extents and extent maps.
+ * This is for the sake of simplicity, and cloning into files larger
+ * than 16Mb would force the full fsync any way (when
+ * try_release_extent_mapping() is invoked during page cache truncation.
+ */
+ if (clone_info)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ if (ret)
+ goto out_trans;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ /*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= end)
+ drop_end = end + 1;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, BTRFS_I(inode), path,
+ cur_offset, drop_end);
+ if (ret) {
+ /* Same comment as above. */
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ }
+ if (clone_info) {
+ ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
+ clone_info->data_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ *trans_out = trans;
+out_free:
+ btrfs_free_block_rsv(fs_info, rsv);
+out:
+ return ret;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_block_rsv *rsv;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
u64 lockstart;
u64 lockend;
u64 tail_start;
u64 tail_len;
u64 orig_start = offset;
- u64 cur_offset;
- u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
- u64 drop_end;
int ret = 0;
- int err = 0;
- unsigned int rsv_count;
bool same_block;
- bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
u64 ino_size;
bool truncated_block = false;
bool updated_inode = false;
@@ -2546,10 +2823,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
- if (ret) {
- inode_unlock(inode);
+ if (ret)
goto out_only_mutex;
- }
path = btrfs_alloc_path();
if (!path) {
@@ -2557,145 +2832,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out_free;
- }
- rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
- rsv->failfast = 1;
-
- /*
- * 1 - update the inode
- * 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set
- */
- rsv_count = no_holes ? 2 : 3;
- trans = btrfs_start_transaction(root, rsv_count);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out_free;
- }
-
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, false);
- BUG_ON(ret);
- trans->block_rsv = rsv;
-
- cur_offset = lockstart;
- len = lockend - cur_offset;
- while (cur_offset < lockend) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
- cur_offset, lockend + 1,
- &drop_end, 1, 0, 0, NULL);
- if (ret != -ENOSPC)
- break;
-
- trans->block_rsv = &fs_info->trans_block_rsv;
-
- if (cur_offset < drop_end && cur_offset < ino_size) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
- if (ret) {
- /*
- * If we failed then we didn't insert our hole
- * entries for the area we dropped, so now the
- * fs is corrupted, so we must abort the
- * transaction.
- */
- btrfs_abort_transaction(trans, ret);
- err = ret;
- break;
- }
- }
-
- cur_offset = drop_end;
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- err = ret;
- break;
- }
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
-
- trans = btrfs_start_transaction(root, rsv_count);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- break;
- }
-
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
- trans->block_rsv = rsv;
-
- ret = find_first_non_hole(inode, &cur_offset, &len);
- if (unlikely(ret < 0))
- break;
- if (ret && !len) {
- ret = 0;
- break;
- }
- }
-
- if (ret) {
- err = ret;
- goto out_trans;
- }
-
- trans->block_rsv = &fs_info->trans_block_rsv;
- /*
- * If we are using the NO_HOLES feature we might have had already an
- * hole that overlaps a part of the region [lockstart, lockend] and
- * ends at (or beyond) lockend. Since we have no file extent items to
- * represent holes, drop_end can be less than lockend and so we must
- * make sure we have an extent map representing the existing hole (the
- * call to __btrfs_drop_extents() might have dropped the existing extent
- * map representing the existing hole), otherwise the fast fsync path
- * will not record the existence of the hole region
- * [existing_hole_start, lockend].
- */
- if (drop_end <= lockend)
- drop_end = lockend + 1;
- /*
- * Don't insert file hole extent item if it's for a range beyond eof
- * (because it's useless) or if it represents a 0 bytes range (when
- * cur_offset == drop_end).
- */
- if (cur_offset < ino_size && cur_offset < drop_end) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
- if (ret) {
- /* Same comment as above. */
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_trans;
- }
- }
-
-out_trans:
- if (!trans)
- goto out_free;
+ ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+ ASSERT(trans != NULL);
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
-
- trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
-out_free:
- btrfs_free_path(path);
- btrfs_free_block_rsv(fs_info, rsv);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
out_only_mutex:
- if (!updated_inode && truncated_block && !ret && !err) {
+ if (!updated_inode && truncated_block && !ret) {
/*
* If we only end up zeroing part of a page, we still need to
* update the inode item, so that all the time fields are
@@ -2703,18 +2857,25 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
+ struct timespec64 now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
} else {
- err = btrfs_update_inode(trans, root, inode);
- ret = btrfs_end_transaction(trans);
+ int ret2;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_end_transaction(trans);
+ if (!ret)
+ ret = ret2;
}
}
inode_unlock(inode);
- if (ret && !err)
- err = ret;
- return err;
+ return ret;
}
/* Helper structure to record which range is already reserved */
@@ -2783,9 +2944,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
}
enum {
- RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
- RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
- RANGE_BOUNDARY_HOLE = 2,
+ RANGE_BOUNDARY_WRITTEN_EXTENT,
+ RANGE_BOUNDARY_PREALLOC_EXTENT,
+ RANGE_BOUNDARY_HOLE,
};
static int btrfs_zero_range_check_range_boundary(struct inode *inode,
@@ -3132,6 +3293,7 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
cur_offset, last_byte - cur_offset);
if (ret < 0) {
+ cur_offset = last_byte;
free_extent_map(em);
break;
}
@@ -3181,34 +3343,35 @@ out:
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(inode, data_reserved,
- alloc_start, alloc_end - cur_offset);
+ cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+ int whence)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ loff_t i_size = inode->i_size;
u64 lockstart;
u64 lockend;
u64 start;
u64 len;
int ret = 0;
- if (inode->i_size == 0)
+ if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
- * *offset can be negative, in this case we start finding DATA/HOLE from
+ * offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
- start = max_t(loff_t, 0, *offset);
+ start = max_t(loff_t, 0, offset);
lockstart = round_down(start, fs_info->sectorsize);
- lockend = round_up(i_size_read(inode),
- fs_info->sectorsize);
+ lockend = round_up(i_size, fs_info->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
@@ -3217,7 +3380,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- while (start < inode->i_size) {
+ while (start < i_size) {
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3240,46 +3403,39 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
cond_resched();
}
free_extent_map(em);
- if (!ret) {
- if (whence == SEEK_DATA && start >= inode->i_size)
- ret = -ENXIO;
- else
- *offset = min_t(loff_t, start, inode->i_size);
- }
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- return ret;
+ if (ret) {
+ offset = ret;
+ } else {
+ if (whence == SEEK_DATA && start >= i_size)
+ offset = -ENXIO;
+ else
+ offset = min_t(loff_t, start, i_size);
+ }
+
+ return offset;
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- int ret;
- inode_lock(inode);
switch (whence) {
- case SEEK_END:
- case SEEK_CUR:
- offset = generic_file_llseek(file, offset, whence);
- goto out;
+ default:
+ return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- ret = find_desired_extent(inode, &offset, whence);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ inode_lock_shared(inode);
+ offset = find_desired_extent(inode, offset, whence);
+ inode_unlock_shared(inode);
+ break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-out:
- inode_unlock(inode);
- return offset;
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 74aa552f4793..3283da419200 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,9 @@
#include "extent_io.h"
#include "inode-map.h"
#include "volumes.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_32K
@@ -75,7 +78,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+ inode = btrfs_iget_path(fs_info->sb, &location, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -88,10 +91,10 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
return inode;
}
-struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache
- *block_group, struct btrfs_path *path)
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct inode *inode = NULL;
u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
@@ -103,7 +106,7 @@ struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
return inode;
inode = __lookup_free_space_inode(fs_info->tree_root, path,
- block_group->key.objectid);
+ block_group->start);
if (IS_ERR(inode))
return inode;
@@ -185,20 +188,19 @@ static int __create_free_space_inode(struct btrfs_root *root,
return 0;
}
-int create_free_space_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+int create_free_space_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
u64 ino;
- ret = btrfs_find_free_objectid(fs_info->tree_root, &ino);
+ ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
if (ret < 0)
return ret;
- return __create_free_space_inode(fs_info->tree_root, trans, path, ino,
- block_group->key.objectid);
+ return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
+ ino, block_group->start);
}
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
@@ -208,8 +210,8 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int ret;
/* 1 for slack space, 1 for updating the inode */
- needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
- btrfs_calc_trans_metadata_size(fs_info, 1);
+ needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
+ btrfs_calc_metadata_size(fs_info, 1);
spin_lock(&rsv->lock);
if (rsv->reserved < needed_bytes)
@@ -221,7 +223,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -382,6 +384,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
@@ -465,9 +473,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_SIZE - offset);
- btrfs_csum_final(crc, (u8 *)&crc);
+ crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
tmp += index;
@@ -493,9 +500,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_SIZE - offset);
- btrfs_csum_final(crc, (u8 *)&crc);
+ crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
"csum mismatch on free space cache");
@@ -764,7 +770,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
} else {
ASSERT(num_bitmaps);
num_bitmaps--;
- e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+ e->bitmap = kmem_cache_zalloc(
+ btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
kmem_cache_free(
btrfs_free_space_cachep, e);
@@ -812,15 +819,15 @@ free_cache:
goto out;
}
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+int load_free_space_cache(struct btrfs_block_group *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
struct btrfs_path *path;
int ret = 0;
bool matched;
- u64 used = btrfs_block_group_used(&block_group->item);
+ u64 used = block_group->used;
/*
* If this block group has been marked to be cleared for one reason or
@@ -858,7 +865,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
* once created get their ->cached field set to BTRFS_CACHE_FINISHED so
* we will never try to read their inode item while the fs is mounted.
*/
- inode = lookup_free_space_inode(fs_info, block_group, path);
+ inode = lookup_free_space_inode(block_group, path);
if (IS_ERR(inode)) {
btrfs_free_path(path);
return 0;
@@ -874,13 +881,13 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
spin_unlock(&block_group->lock);
ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->key.offset - used -
+ matched = (ctl->free_space == (block_group->length - used -
block_group->bytes_super));
spin_unlock(&ctl->tree_lock);
@@ -888,7 +895,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
__btrfs_remove_free_space_cache(ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
- block_group->key.objectid);
+ block_group->start);
ret = -1;
}
out:
@@ -901,7 +908,7 @@ out:
btrfs_warn(fs_info,
"failed to load free space cache for block group %llu, rebuilding it now",
- block_group->key.objectid);
+ block_group->start);
}
iput(inode);
@@ -911,7 +918,7 @@ out:
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
int *entries, int *bitmaps,
struct list_head *bitmap_list)
{
@@ -1004,7 +1011,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, 0, 0, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1016,9 +1023,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL);
+ inode->i_size - 1, EXTENT_DELALLOC, 0,
+ 0, NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1039,9 +1045,8 @@ fail:
return -1;
}
-static noinline_for_stack int
-write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+static noinline_for_stack int write_pinned_extent_entries(
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
{
@@ -1059,11 +1064,11 @@ write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
- unpin = fs_info->pinned_extents;
+ unpin = block_group->fs_info->pinned_extents;
- start = block_group->key.objectid;
+ start = block_group->start;
- while (start < block_group->key.objectid + block_group->key.offset) {
+ while (start < block_group->start + block_group->length) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
@@ -1071,13 +1076,12 @@ write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
return 0;
/* This pinned extent is out of our range */
- if (extent_start >= block_group->key.objectid +
- block_group->key.offset)
+ if (extent_start >= block_group->start + block_group->length)
return 0;
extent_start = max(extent_start, start);
- extent_end = min(block_group->key.objectid +
- block_group->key.offset, extent_end + 1);
+ extent_end = min(block_group->start + block_group->length,
+ extent_end + 1);
len = extent_end - extent_start;
*entries += 1;
@@ -1115,7 +1119,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, 0, 0, NULL);
return ret;
}
@@ -1141,7 +1145,7 @@ cleanup_write_cache_enospc(struct inode *inode,
static int __btrfs_wait_cache_io(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_path *path, u64 offset)
{
@@ -1168,7 +1172,7 @@ out:
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ block_group->start);
#endif
}
}
@@ -1210,12 +1214,12 @@ static int btrfs_wait_cache_io_root(struct btrfs_root *root,
}
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
block_group, &block_group->io_ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
}
/**
@@ -1231,11 +1235,10 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_state *cached_state = NULL;
LIST_HEAD(bitmap_list);
int entries = 0;
@@ -1293,8 +1296,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(fs_info, block_group,
- io_ctl, &entries);
+ ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@@ -1370,11 +1372,11 @@ out_unlock:
goto out;
}
-int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
@@ -1386,7 +1388,7 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
}
spin_unlock(&block_group->lock);
- inode = lookup_free_space_inode(fs_info, block_group, path);
+ inode = lookup_free_space_inode(block_group, path);
if (IS_ERR(inode))
return 0;
@@ -1396,7 +1398,7 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
#ifdef DEBUG
btrfs_err(fs_info,
"failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ block_group->start);
#endif
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
@@ -1649,11 +1651,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
- u64 size = block_group->key.offset;
+ u64 size = block_group->length;
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
@@ -1884,7 +1886,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
unlink_free_space(ctl, bitmap_info);
- kfree(bitmap_info->bitmap);
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
@@ -1993,7 +1995,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2030,7 +2032,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* so allow those block groups to still be allowed to have a bitmap
* entry.
*/
- if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
return false;
return true;
@@ -2045,7 +2047,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_free_space *bitmap_info;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
int added = 0;
u64 bytes, offset, bytes_added;
int ret;
@@ -2138,7 +2140,8 @@ new_bitmap:
}
/* allocate the bitmap */
- info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
+ GFP_NOFS);
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -2149,7 +2152,9 @@ new_bitmap:
out:
if (info) {
- kfree(info->bitmap);
+ if (info->bitmap)
+ kmem_cache_free(btrfs_free_space_bitmap_cachep,
+ info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
@@ -2379,7 +2384,15 @@ out:
return ret;
}
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ return __btrfs_add_free_space(block_group->fs_info,
+ block_group->free_space_ctl,
+ bytenr, size);
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2469,7 +2482,7 @@ out:
return ret;
}
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2494,14 +2507,14 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
- ctl->start = block_group->key.objectid;
+ ctl->start = block_group->start;
ctl->private = block_group;
ctl->op = &free_space_op;
INIT_LIST_HEAD(&ctl->trimming_ranges);
@@ -2523,7 +2536,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
*/
static int
__btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2589,7 +2602,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
spin_unlock(&ctl->tree_lock);
}
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
@@ -2611,7 +2624,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
}
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size)
{
@@ -2665,7 +2678,7 @@ out:
* cluster and remove the cluster from it.
*/
int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
@@ -2699,7 +2712,7 @@ int btrfs_return_cluster_to_free_space(
return ret;
}
-static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct btrfs_free_space *entry,
u64 bytes, u64 min_start,
@@ -2732,7 +2745,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
* if it couldn't find anything suitably large, or a logical disk offset
* if things worked out
*/
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size)
{
@@ -2805,7 +2818,8 @@ out:
if (entry->bytes == 0) {
ctl->free_extents--;
if (entry->bitmap) {
- kfree(entry->bitmap);
+ kmem_cache_free(btrfs_free_space_bitmap_cachep,
+ entry->bitmap);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
}
@@ -2817,7 +2831,7 @@ out:
return ret;
}
-static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes,
@@ -2899,7 +2913,7 @@ again:
* extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
-setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -2990,7 +3004,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* that we have already failed to find extents that will work.
*/
static noinline int
-setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -3040,11 +3054,11 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
-int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
@@ -3131,7 +3145,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
cluster->block_group = NULL;
}
-static int do_trimming(struct btrfs_block_group_cache *block_group,
+static int do_trimming(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
u64 reserved_start, u64 reserved_bytes,
struct btrfs_trim_range *trim_entry)
@@ -3169,14 +3183,14 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
space_info->bytes_readonly += reserved_bytes;
block_group->reserved -= reserved_bytes;
space_info->bytes_reserved -= reserved_bytes;
- spin_unlock(&space_info->lock);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
}
return ret;
}
-static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+static int trim_no_bitmap(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3261,7 +3275,7 @@ out:
return ret;
}
-static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+static int trim_bitmaps(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3342,12 +3356,12 @@ next:
return ret;
}
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
{
atomic_inc(&cache->trimming);
}
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_map_tree *em_tree;
@@ -3361,15 +3375,11 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
if (cleanup) {
mutex_lock(&fs_info->chunk_mutex);
- em_tree = &fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ em = lookup_extent_mapping(em_tree, block_group->start,
1);
BUG_ON(!em); /* logic error, can't happen */
- /*
- * remove_extent_mapping() will delete us from the pinned_chunks
- * list, which is protected by the chunk mutex.
- */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
mutex_unlock(&fs_info->chunk_mutex);
@@ -3386,7 +3396,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
}
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
int ret;
@@ -3584,7 +3594,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
* how the free space cache loading stuff works, so you can get really weird
* configurations.
*/
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
@@ -3613,7 +3623,7 @@ again:
}
if (!map) {
- map = kzalloc(PAGE_SIZE, GFP_NOFS);
+ map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!map) {
kmem_cache_free(btrfs_free_space_cachep, info);
return -ENOMEM;
@@ -3642,7 +3652,8 @@ again:
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
- kfree(map);
+ if (map)
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
return 0;
}
@@ -3651,7 +3662,7 @@ again:
* just used to check the absence of space, so if there is free space in the
* range at all we will return 1.
*/
-int test_check_exists(struct btrfs_block_group_cache *cache,
+int test_check_exists(struct btrfs_block_group *cache,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 15e30b93db0d..ba9a23241101 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -36,29 +36,37 @@ struct btrfs_free_space_op {
struct btrfs_free_space *info);
};
-struct btrfs_io_ctl;
+struct btrfs_io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_fs_info *fs_info;
+ struct inode *inode;
+ unsigned long size;
+ int index;
+ int num_pages;
+ int entries;
+ int bitmaps;
+ unsigned check_crcs:1;
+};
-struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache
- *block_group, struct btrfs_path *path);
-int create_free_space_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode);
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group);
+int load_free_space_cache(struct btrfs_block_group *block_group);
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
struct btrfs_path *path);
@@ -72,49 +80,40 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_path *path,
struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 bytenr, u64 size);
-static inline int
-btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytenr, u64 size)
-{
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size);
-}
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size);
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap);
-int test_check_exists(struct btrfs_block_group_cache *cache,
- u64 offset, u64 bytes);
+int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
#endif
#endif
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index e5089087eaa6..258cb3fae17a 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -10,12 +10,13 @@
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
+#include "block-group.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -26,8 +27,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
* exceeds that required for using bitmaps.
*/
bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
- num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
- bitmap_range);
+ num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range);
bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
total_bitmap_size = num_bitmaps * bitmap_size;
cache->bitmap_high_thresh = div_u64(total_bitmap_size,
@@ -44,7 +44,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
}
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
@@ -53,9 +53,9 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
@@ -76,24 +76,25 @@ out:
EXPORT_FOR_TESTS
struct btrfs_free_space_info *search_free_space_info(
- struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_key key;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
- block_group->key.objectid);
+ block_group->start);
ASSERT(0);
return ERR_PTR(-ENOENT);
}
@@ -178,7 +179,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -195,7 +196,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -203,8 +204,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -222,8 +223,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
@@ -253,7 +254,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ info = search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
@@ -269,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -318,7 +319,7 @@ out:
EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -334,7 +335,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -342,8 +343,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -361,8 +362,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
@@ -398,7 +399,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ info = search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
@@ -411,7 +412,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+ nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -435,7 +436,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -451,7 +452,7 @@ out:
}
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
int new_extents)
{
@@ -463,8 +464,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, trans->fs_info, block_group, path,
- 1);
+ info = search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
@@ -490,7 +490,7 @@ out:
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
@@ -512,7 +512,7 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group,
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+static void free_space_set_bits(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -580,7 +580,7 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
* the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
@@ -596,7 +596,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* Read the bit for the block immediately before the extent of space if
* that block is within the block group.
*/
- if (start > block_group->key.objectid) {
+ if (start > block_group->start) {
u64 prev_block = start - block_group->fs_info->sectorsize;
key.objectid = prev_block;
@@ -648,7 +648,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* Read the bit for the block immediately after the extent of space if
* that block is within the block group.
*/
- if (end < block_group->key.objectid + block_group->key.offset) {
+ if (end < block_group->start + block_group->length) {
/* The next block may be in the next bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (end >= key.objectid + key.offset) {
@@ -693,7 +693,7 @@ out:
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -780,7 +780,7 @@ out:
EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
struct btrfs_free_space_info *info;
@@ -793,8 +793,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
return ret;
}
- info = search_free_space_info(NULL, trans->fs_info, block_group, path,
- 0);
+ info = search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -812,7 +811,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -846,7 +845,7 @@ out:
}
static int add_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -880,7 +879,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
new_key.offset = size;
/* Search for a neighbor on the left. */
- if (start == block_group->key.objectid)
+ if (start == block_group->start)
goto right;
key.objectid = start - 1;
key.type = (u8)-1;
@@ -900,8 +899,8 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT(found_start < start && found_end <= start);
/*
@@ -920,7 +919,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
right:
/* Search for a neighbor on the right. */
- if (end == block_group->key.objectid + block_group->key.offset)
+ if (end == block_group->start + block_group->length)
goto insert;
key.objectid = end;
key.type = (u8)-1;
@@ -940,8 +939,8 @@ right:
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT((found_start < start && found_end <= start) ||
(found_start >= end && found_end > end));
@@ -974,10 +973,9 @@ out:
EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_free_space_info *info;
u32 flags;
int ret;
@@ -988,7 +986,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
return ret;
}
- info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ info = search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1006,7 +1004,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -1044,7 +1042,7 @@ out:
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct btrfs_path *path, *path2;
@@ -1076,7 +1074,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
* BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
* contained in.
*/
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
@@ -1085,8 +1083,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
goto out_locked;
ASSERT(ret == 0);
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
while (1) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -1110,7 +1108,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- if (key.objectid != block_group->key.objectid)
+ if (key.objectid != block_group->start)
break;
}
@@ -1141,7 +1139,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *free_space_root;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct rb_node *node;
int ret;
@@ -1150,7 +1148,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
- free_space_root = btrfs_create_tree(trans, fs_info,
+ free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
@@ -1160,7 +1158,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
node = rb_first(&fs_info->block_group_cache_tree);
while (node) {
- block_group = rb_entry(node, struct btrfs_block_group_cache,
+ block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
if (ret)
@@ -1248,7 +1246,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
- clean_tree_block(fs_info, free_space_root->node);
+ btrfs_clean_tree_block(free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
0, 1);
@@ -1266,7 +1264,7 @@ abort:
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
@@ -1278,12 +1276,12 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
return ret;
return __add_to_free_space_tree(trans, block_group, path,
- block_group->key.objectid,
- block_group->key.offset);
+ block_group->start,
+ block_group->length);
}
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path = NULL;
@@ -1313,7 +1311,7 @@ out:
}
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_path *path;
@@ -1337,8 +1335,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -1356,8 +1354,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
nr++;
path->slots[0]--;
@@ -1392,7 +1390,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1408,7 +1406,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1455,7 +1453,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1473,7 +1471,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1486,7 +1484,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1517,7 +1515,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1533,15 +1531,13 @@ out:
int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
struct btrfs_path *path;
u32 extent_count, flags;
int ret;
block_group = caching_ctl->block_group;
- fs_info = block_group->fs_info;
path = btrfs_alloc_path();
if (!path)
@@ -1555,7 +1551,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
path->search_commit_root = 1;
path->reada = READA_FORWARD;
- info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ info = search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 3133651d7d70..dc2463e4cfe3 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_FREE_SPACE_TREE_H
#define BTRFS_FREE_SPACE_TREE_H
+struct btrfs_caching_control;
+
/*
* The default size for new free space bitmap items. The last bitmap in a block
* group may be truncated, and none of the free space tree code assumes that
@@ -14,14 +16,14 @@
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
@@ -30,22 +32,21 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset);
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a8956a3c9e05..668701832845 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -8,9 +8,9 @@
#include "transaction.h"
#include "print-tree.h"
-int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
- const char *name,
- int name_len, struct btrfs_inode_ref **ref_ret)
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot, const char *name,
+ int name_len)
{
struct btrfs_inode_ref *ref;
unsigned long ptr;
@@ -28,19 +28,15 @@ int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot,
cur_offset += len + sizeof(*ref);
if (len != name_len)
continue;
- if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
- if (ref_ret)
- *ref_ret = ref;
- return 1;
- }
+ if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ return ref;
}
- return 0;
+ return NULL;
}
-int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
- u64 ref_objectid,
- const char *name, int name_len,
- struct btrfs_inode_extref **extref_ret)
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const char *name, int name_len)
{
struct btrfs_inode_extref *extref;
unsigned long ptr;
@@ -65,15 +61,12 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
if (ref_name_len == name_len &&
btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
- (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
- if (extref_ret)
- *extref_ret = extref;
- return 1;
- }
+ (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
+ return extref;
cur_offset += ref_name_len + sizeof(*extref);
}
- return 0;
+ return NULL;
}
/* Returns NULL if no extref found */
@@ -87,7 +80,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct btrfs_inode_extref *extref;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
@@ -98,11 +90,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
if (ret > 0)
return NULL;
- if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid, name, name_len,
- &extref))
- return NULL;
- return extref;
+ return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid, name, name_len);
+
}
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
@@ -142,9 +132,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
* This should always succeed so error here will make the FS
* readonly.
*/
- if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid,
- name, name_len, &extref)) {
+ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid, name, name_len);
+ if (!extref) {
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
@@ -170,7 +160,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1);
+ btrfs_truncate_item(path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -213,8 +203,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
} else if (ret < 0) {
goto out;
}
- if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, name_len, &ref)) {
+
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
+ name_len);
+ if (!ref) {
ret = -ENOENT;
search_ext_refs = 1;
goto out;
@@ -234,7 +226,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -285,10 +277,10 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
- name, name_len, NULL))
+ name, name_len))
goto out;
- btrfs_extend_item(root->fs_info, path, ins_len);
+ btrfs_extend_item(path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -341,13 +333,13 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ins_len);
if (ret == -EEXIST) {
u32 old_size;
-
- if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, name_len, &ref))
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, name_len);
+ if (ref)
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- btrfs_extend_item(fs_info, path, ins_len);
+ btrfs_extend_item(path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -359,7 +351,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EOVERFLOW) {
if (btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
- name, name_len, &ref))
+ name, name_len))
ret = -EEXIST;
else
ret = -EMLINK;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ffca2abf13d0..37345fb6191d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -11,6 +11,20 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"
+#include "delalloc-space.h"
+
+static void fail_caching_thread(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ btrfs_warn(fs_info, "failed to start inode caching task");
+ btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
+ "disabling inode map caching");
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_ERROR;
+ spin_unlock(&root->ino_cache_lock);
+ wake_up(&root->ino_cache_wait);
+}
static int caching_kthread(void *data)
{
@@ -28,8 +42,10 @@ static int caching_kthread(void *data)
return 0;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ fail_caching_thread(root);
return -ENOMEM;
+ }
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
@@ -145,6 +161,7 @@ static void start_caching(struct btrfs_root *root)
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
+ wake_up(&root->ino_cache_wait);
return;
}
@@ -159,15 +176,13 @@ static void start_caching(struct btrfs_root *root)
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(fs_info, ctl, objectid,
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ wake_up(&root->ino_cache_wait);
}
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
- if (IS_ERR(tsk)) {
- btrfs_warn(fs_info, "failed to start inode caching task");
- btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
- "disabling inode map caching");
- }
+ if (IS_ERR(tsk))
+ fail_caching_thread(root);
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -185,11 +200,14 @@ again:
wait_event(root->ino_cache_wait,
root->ino_cache_state == BTRFS_CACHE_FINISHED ||
+ root->ino_cache_state == BTRFS_CACHE_ERROR ||
root->free_ino_ctl->free_space > 0);
if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
+ else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
+ return btrfs_find_free_objectid(root, objectid);
else
goto again;
}
@@ -418,7 +436,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
* 1 item for free space object
* 3 items for pre-allocation
*/
- trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10);
+ trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
@@ -483,12 +501,13 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2973608824ec..c70baafb2a39 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -28,7 +28,9 @@
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -45,7 +47,8 @@
#include "backref.h"
#include "props.h"
#include "qgroup.h"
-#include "dedupe.h"
+#include "delalloc-space.h"
+#include "block-group.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -72,26 +75,15 @@ static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
-
-#define S_SHIFT 12
-static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
- [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
-};
+struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, u64 delalloc_end,
- int *page_started, unsigned long *nr_written,
- int unlock, struct btrfs_dedupe_hash *hash);
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
@@ -187,6 +179,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
unsigned long offset;
+ ASSERT((compressed_size > 0 && compressed_pages) ||
+ (compressed_size == 0 && !compressed_pages));
+
if (compressed_size && compressed_pages)
cur_size = compressed_size;
@@ -366,18 +361,25 @@ struct async_extent {
struct list_head list;
};
-struct async_cow {
+struct async_chunk {
struct inode *inode;
- struct btrfs_fs_info *fs_info;
struct page *locked_page;
u64 start;
u64 end;
unsigned int write_flags;
struct list_head extents;
+ struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
+ atomic_t *pending;
};
-static noinline int add_async_extent(struct async_cow *cow,
+struct async_cow {
+ /* Number of chunks in flight; must be first in the structure */
+ atomic_t num_chunks;
+ struct async_chunk chunks[];
+};
+
+static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
@@ -398,10 +400,31 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool inode_can_compress(struct inode *inode)
+{
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ if (!inode_can_compress(inode)) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+ btrfs_ino(BTRFS_I(inode)));
+ return 0;
+ }
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -444,15 +467,15 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
* are written in the same order that the flusher thread sent them
* down.
*/
-static noinline void compress_file_range(struct inode *inode,
- struct page *locked_page,
- u64 start, u64 end,
- struct async_cow *async_cow,
- int *num_added)
+static noinline int compress_file_range(struct async_chunk *async_chunk)
{
+ struct inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
+ u64 start = async_chunk->start;
+ u64 end = async_chunk->end;
u64 actual_end;
+ u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -461,12 +484,25 @@ static noinline void compress_file_range(struct inode *inode,
int i;
int will_compress;
int compress_type = fs_info->compress_type;
+ int compressed_extents = 0;
int redirty = 0;
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, i_size_read(inode), end + 1);
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -597,14 +633,21 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, end,
- NULL, clear_flags,
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
+ clear_flags,
PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- goto free_pages_out;
+
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+
+ return 0;
}
}
@@ -623,14 +666,14 @@ cont:
*/
total_in = ALIGN(total_in, PAGE_SIZE);
if (total_compressed + blocksize <= total_in) {
- *num_added += 1;
+ compressed_extents++;
/*
* The async work queues will take care of doing actual
* allocation on disk for these compressed pages, and
* will submit them to the elevator.
*/
- add_async_extent(async_cow, start, total_in,
+ add_async_extent(async_chunk, start, total_in,
total_compressed, pages, nr_pages,
compress_type);
@@ -640,7 +683,7 @@ cont:
cond_resched();
goto again;
}
- return;
+ return compressed_extents;
}
}
if (pages) {
@@ -670,25 +713,20 @@ cleanup_and_bail_uncompressed:
* to our extent and set things up for the async work queue to run
* cow_file_range to do the normal delalloc dance.
*/
- if (page_offset(locked_page) >= start &&
- page_offset(locked_page) <= end)
- __set_page_dirty_nobuffers(locked_page);
+ if (async_chunk->locked_page &&
+ (page_offset(async_chunk->locked_page) >= start &&
+ page_offset(async_chunk->locked_page)) <= end) {
+ __set_page_dirty_nobuffers(async_chunk->locked_page);
/* unlocked later on in the async handlers */
+ }
if (redirty)
extent_range_redirty_for_io(inode, start, end);
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
+ add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
- *num_added += 1;
+ compressed_extents++;
- return;
-
-free_pages_out:
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
- }
- kfree(pages);
+ return compressed_extents;
}
static void free_async_extent_pages(struct async_extent *async_extent)
@@ -713,45 +751,38 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct inode *inode = async_cow->inode;
+ struct inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret = 0;
again:
- while (!list_empty(&async_cow->extents)) {
- async_extent = list_entry(async_cow->extents.next,
+ while (!list_empty(&async_chunk->extents)) {
+ async_extent = list_entry(async_chunk->extents.next,
struct async_extent, list);
list_del(&async_extent->list);
- io_tree = &BTRFS_I(inode)->io_tree;
-
retry:
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start + async_extent->ram_size - 1);
/* did the compression code fall back to uncompressed IO? */
if (!async_extent->pages) {
int page_started = 0;
unsigned long nr_written = 0;
- lock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
-
/* allocate blocks */
- ret = cow_file_range(inode, async_cow->locked_page,
+ ret = cow_file_range(inode, async_chunk->locked_page,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0,
- NULL);
+ &page_started, &nr_written, 0);
/* JDM XXX */
@@ -767,16 +798,13 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
WB_SYNC_ALL);
- else if (ret)
- unlock_page(async_cow->locked_page);
+ else if (ret && async_chunk->locked_page)
+ unlock_page(async_chunk->locked_page);
kfree(async_extent);
cond_resched();
continue;
}
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
-
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
@@ -844,8 +872,6 @@ retry:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- async_extent->start +
- async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
@@ -855,7 +881,8 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages,
- async_cow->write_flags)) {
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -864,7 +891,7 @@ retry:
btrfs_writepage_endio_finish_ordered(p, start, end, 0);
p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, end,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, 0,
PAGE_END_WRITEBACK |
PAGE_SET_ERROR);
@@ -882,8 +909,6 @@ out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- async_extent->start +
- async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -942,9 +967,8 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
*/
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, u64 delalloc_end,
- int *page_started, unsigned long *nr_written,
- int unlock, struct btrfs_dedupe_hash *hash)
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -983,8 +1007,7 @@ static noinline int cow_file_range(struct inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end,
- delalloc_end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1067,7 +1090,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode, start,
start + ram_size - 1,
- delalloc_end, locked_page,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1112,7 +1135,6 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size,
- start + cur_alloc_size,
locked_page,
clear_bits,
page_ops);
@@ -1120,8 +1142,7 @@ out_unlock:
if (start >= end)
goto out;
}
- extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
- locked_page,
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
goto out;
@@ -1132,16 +1153,15 @@ out_unlock:
*/
static noinline void async_cow_start(struct btrfs_work *work)
{
- struct async_cow *async_cow;
- int num_added = 0;
- async_cow = container_of(work, struct async_cow, work);
+ struct async_chunk *async_chunk;
+ int compressed_extents;
- compress_file_range(async_cow->inode, async_cow->locked_page,
- async_cow->start, async_cow->end, async_cow,
- &num_added);
- if (num_added == 0) {
- btrfs_add_delayed_iput(async_cow->inode);
- async_cow->inode = NULL;
+ async_chunk = container_of(work, struct async_chunk, work);
+
+ compressed_extents = compress_file_range(async_chunk);
+ if (compressed_extents == 0) {
+ btrfs_add_delayed_iput(async_chunk->inode);
+ async_chunk->inode = NULL;
}
}
@@ -1150,14 +1170,12 @@ static noinline void async_cow_start(struct btrfs_work *work)
*/
static noinline void async_cow_submit(struct btrfs_work *work)
{
- struct btrfs_fs_info *fs_info;
- struct async_cow *async_cow;
+ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
+ work);
+ struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
unsigned long nr_pages;
- async_cow = container_of(work, struct async_cow, work);
-
- fs_info = async_cow->fs_info;
- nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+ nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
/* atomic_sub_return implies a barrier */
@@ -1166,69 +1184,139 @@ static noinline void async_cow_submit(struct btrfs_work *work)
cond_wake_up_nomb(&fs_info->async_submit_wait);
/*
- * ->inode could be NULL if async_cow_start has failed to compress,
+ * ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
* always adjust ->async_delalloc_pages as its paired with the init
* happening in cow_file_range_async
*/
- if (async_cow->inode)
- submit_compressed_extents(async_cow);
+ if (async_chunk->inode)
+ submit_compressed_extents(async_chunk);
}
static noinline void async_cow_free(struct btrfs_work *work)
{
- struct async_cow *async_cow;
- async_cow = container_of(work, struct async_cow, work);
- if (async_cow->inode)
- btrfs_add_delayed_iput(async_cow->inode);
- kfree(async_cow);
+ struct async_chunk *async_chunk;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+ if (async_chunk->inode)
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+ /*
+ * Since the pointer to 'pending' is at the beginning of the array of
+ * async_chunk's, freeing it ensures the whole array has been freed.
+ */
+ if (atomic_dec_and_test(async_chunk->pending))
+ kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+static int cow_file_range_async(struct inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- unsigned int write_flags)
+ unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct async_cow *async_cow;
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
+ struct async_cow *ctx;
+ struct async_chunk *async_chunk;
unsigned long nr_pages;
u64 cur_end;
+ u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
+ int i;
+ bool should_compress;
+ unsigned nofs_flag;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
+ num_chunks = 1;
+ should_compress = false;
+ } else {
+ should_compress = true;
+ }
+
+ nofs_flag = memalloc_nofs_save();
+ ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (!ctx) {
+ unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR;
+
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits, page_ops);
+ return -ENOMEM;
+ }
+
+ async_chunk = ctx->chunks;
+ atomic_set(&ctx->num_chunks, num_chunks);
+
+ for (i = 0; i < num_chunks; i++) {
+ if (should_compress)
+ cur_end = min(end, start + SZ_512K - 1);
+ else
+ cur_end = end;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
- 1, 0, NULL);
- while (start < end) {
- async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
- BUG_ON(!async_cow); /* -ENOMEM */
/*
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
ihold(inode);
- async_cow->inode = inode;
- async_cow->fs_info = fs_info;
- async_cow->locked_page = locked_page;
- async_cow->start = start;
- async_cow->write_flags = write_flags;
-
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
- !btrfs_test_opt(fs_info, FORCE_COMPRESS))
- cur_end = end;
- else
- cur_end = min(end, start + SZ_512K - 1);
+ async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].inode = inode;
+ async_chunk[i].start = start;
+ async_chunk[i].end = cur_end;
+ async_chunk[i].write_flags = write_flags;
+ INIT_LIST_HEAD(&async_chunk[i].extents);
- async_cow->end = cur_end;
- INIT_LIST_HEAD(&async_cow->extents);
+ /*
+ * The locked_page comes all the way from writepage and its
+ * the original page we were actually given. As we spread
+ * this large delalloc region across multiple async_chunk
+ * structs, only the first struct needs a pointer to locked_page
+ *
+ * This way we don't need racey decisions about who is supposed
+ * to unlock it.
+ */
+ if (locked_page) {
+ /*
+ * Depending on the compressibility, the pages might or
+ * might not go through async. We want all of them to
+ * be accounted against wbc once. Let's do it here
+ * before the paths diverge. wbc accounting is used
+ * only for foreign writeback detection and doesn't
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+ wbc_account_cgroup_owner(wbc, locked_page,
+ cur_end - start);
+ async_chunk[i].locked_page = locked_page;
+ locked_page = NULL;
+ } else {
+ async_chunk[i].locked_page = NULL;
+ }
+
+ if (blkcg_css != blkcg_root_css) {
+ css_get(blkcg_css);
+ async_chunk[i].blkcg_css = blkcg_css;
+ } else {
+ async_chunk[i].blkcg_css = NULL;
+ }
- btrfs_init_work(&async_cow->work,
- btrfs_delalloc_helper,
- async_cow_start, async_cow_submit,
- async_cow_free);
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
+ async_cow_submit, async_cow_free);
- nr_pages = (cur_end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
+ nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
- btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
+ btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
*nr_written += nr_pages;
start = cur_end + 1;
@@ -1268,36 +1356,25 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
*/
static noinline int run_delalloc_nocow(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, int *page_started, int force,
- unsigned long *nr_written)
+ const u64 start, const u64 end,
+ int *page_started, int force,
+ unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_buffer *leaf;
struct btrfs_path *path;
- struct btrfs_file_extent_item *fi;
- struct btrfs_key found_key;
- struct extent_map *em;
- u64 cow_start;
- u64 cur_offset;
- u64 extent_end;
- u64 extent_offset;
- u64 disk_bytenr;
- u64 num_bytes;
- u64 disk_num_bytes;
- u64 ram_bytes;
- int extent_type;
+ u64 cow_start = (u64)-1;
+ u64 cur_offset = start;
int ret;
- int type;
- int nocow;
- int check_prev = 1;
- bool nolock;
+ bool check_prev = true;
+ const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
u64 ino = btrfs_ino(BTRFS_I(inode));
+ bool nocow = false;
+ u64 disk_bytenr = 0;
path = btrfs_alloc_path();
if (!path) {
- extent_clear_unlock_delalloc(inode, start, end, end,
- locked_page,
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
@@ -1307,15 +1384,29 @@ static noinline int run_delalloc_nocow(struct inode *inode,
return -ENOMEM;
}
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
-
- cow_start = (u64)-1;
- cur_offset = start;
while (1) {
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ u64 extent_end;
+ u64 extent_offset;
+ u64 num_bytes = 0;
+ u64 disk_num_bytes;
+ u64 ram_bytes;
+ int extent_type;
+
+ nocow = false;
+
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
if (ret < 0)
goto error;
+
+ /*
+ * If there is no extent for our range when doing the initial
+ * search, then go back to the previous slot as it will be the
+ * one containing the search offset
+ */
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1324,8 +1415,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
- check_prev = 0;
+ check_prev = false;
next_slot:
+ /* Go to next leaf if we have exhausted the current one */
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
@@ -1339,28 +1431,40 @@ next_slot:
leaf = path->nodes[0];
}
- nocow = 0;
- disk_bytenr = 0;
- num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ /* Didn't find anything for our INO */
if (found_key.objectid > ino)
break;
+ /*
+ * Keep searching until we find an EXTENT_ITEM or there are no
+ * more extents for this inode
+ */
if (WARN_ON_ONCE(found_key.objectid < ino) ||
found_key.type < BTRFS_EXTENT_DATA_KEY) {
path->slots[0]++;
goto next_slot;
}
+
+ /* Found key is not EXTENT_DATA_KEY or starts after req range */
if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
+ /*
+ * If the found extent starts after requested offset, then
+ * adjust extent_end to be right before this extent begins
+ */
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
extent_type = 0;
goto out_check;
}
+ /*
+ * Found extent which begins before our range and potentially
+ * intersect it
+ */
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
@@ -1374,26 +1478,36 @@ next_slot:
btrfs_file_extent_num_bytes(leaf, fi);
disk_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
- if (extent_end <= start) {
+ /*
+ * If the extent we got ends before our current offset,
+ * skip to the next extent.
+ */
+ if (extent_end <= cur_offset) {
path->slots[0]++;
goto next_slot;
}
+ /* Skip holes */
if (disk_bytenr == 0)
goto out_check;
+ /* Skip compressed/encrypted/encoded extents */
if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out_check;
/*
- * Do the same check as in btrfs_cross_ref_exist but
- * without the unnecessary search.
+ * If extent is created before the last volume's snapshot
+ * this implies the extent is shared, hence we can't do
+ * nocow. This is the same check as in
+ * btrfs_cross_ref_exist but without calling
+ * btrfs_search_slot.
*/
- if (!nolock &&
+ if (!freespace_inode &&
btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
+ /* If extent is RO, we must COW it */
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
@@ -1410,17 +1524,17 @@ next_slot:
goto error;
}
- WARN_ON_ONCE(nolock);
+ WARN_ON_ONCE(freespace_inode);
goto out_check;
}
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
- * if there are pending snapshots for this root,
- * we fall into common COW way.
+ * If there are pending snapshots for this root, we
+ * fall into common COW way
*/
- if (!nolock && atomic_read(&root->snapshot_force_cow))
+ if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
goto out_check;
/*
* force cow if csum exists in the range.
@@ -1439,27 +1553,29 @@ next_slot:
cur_offset = cow_start;
goto error;
}
- WARN_ON_ONCE(nolock);
+ WARN_ON_ONCE(freespace_inode);
goto out_check;
}
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
- nocow = 1;
+ nocow = true;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = found_key.offset +
- btrfs_file_extent_ram_bytes(leaf, fi);
- extent_end = ALIGN(extent_end,
- fs_info->sectorsize);
+ extent_end = found_key.offset + ram_bytes;
+ extent_end = ALIGN(extent_end, fs_info->sectorsize);
+ /* Skip extents outside of our requested range */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
} else {
- BUG_ON(1);
+ /* If this triggers then we have a memory corruption */
+ BUG();
}
out_check:
- if (extent_end <= start) {
- path->slots[0]++;
- if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- goto next_slot;
- }
+ /*
+ * If nocow is false then record the beginning of the range
+ * that needs to be COWed
+ */
if (!nocow) {
if (cow_start == (u64)-1)
cow_start = cur_offset;
@@ -1471,11 +1587,16 @@ out_check:
}
btrfs_release_path(path);
+
+ /*
+ * COW range from cow_start to found_key.offset - 1. As the key
+ * will contain the beginning of the first extent that can be
+ * NOCOW, following one which needs to be COW'ed
+ */
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page,
cow_start, found_key.offset - 1,
- end, page_started, nr_written, 1,
- NULL);
+ page_started, nr_written, 1);
if (ret) {
if (nocow)
btrfs_dec_nocow_writers(fs_info,
@@ -1487,6 +1608,7 @@ out_check:
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 orig_start = found_key.offset - extent_offset;
+ struct extent_map *em;
em = create_io_em(inode, cur_offset, num_bytes,
orig_start,
@@ -1503,19 +1625,29 @@ out_check:
goto error;
}
free_extent_map(em);
- }
-
- if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- type = BTRFS_ORDERED_PREALLOC;
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
+ disk_bytenr, num_bytes,
+ num_bytes,
+ BTRFS_ORDERED_PREALLOC);
+ if (ret) {
+ btrfs_drop_extent_cache(BTRFS_I(inode),
+ cur_offset,
+ cur_offset + num_bytes - 1,
+ 0);
+ goto error;
+ }
} else {
- type = BTRFS_ORDERED_NOCOW;
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
+ disk_bytenr, num_bytes,
+ num_bytes,
+ BTRFS_ORDERED_NOCOW);
+ if (ret)
+ goto error;
}
- ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
- num_bytes, num_bytes, type);
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- BUG_ON(ret); /* -ENOMEM */
+ nocow = false;
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -1528,7 +1660,7 @@ out_check:
num_bytes);
extent_clear_unlock_delalloc(inode, cur_offset,
- cur_offset + num_bytes - 1, end,
+ cur_offset + num_bytes - 1,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
@@ -1553,15 +1685,18 @@ out_check:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = cow_file_range(inode, locked_page, cow_start, end, end,
- page_started, nr_written, 1, NULL);
+ ret = cow_file_range(inode, locked_page, cow_start, end,
+ page_started, nr_written, 1);
if (ret)
goto error;
}
error:
+ if (nocow)
+ btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+
if (ret && cur_offset < end)
- extent_clear_unlock_delalloc(inode, cur_offset, end, end,
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1602,7 +1737,6 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- unsigned int write_flags = wbc_to_write_flags(wbc);
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1610,15 +1744,15 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!inode_need_compress(inode, start, end)) {
- ret = cow_file_range(inode, locked_page, start, end, end,
- page_started, nr_written, 1, NULL);
+ } else if (!inode_can_compress(inode) ||
+ !inode_need_compress(inode, start, end)) {
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
- ret = cow_file_range_async(inode, locked_page, start, end,
- page_started, nr_written,
- write_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+ page_started, nr_written);
}
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
@@ -1912,17 +2046,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
u64 length = 0;
u64 map_length;
int ret;
+ struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- NULL, 0);
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
+ &geom);
if (ret < 0)
return ret;
- if (map_length < length + size)
+
+ if (geom.len < length + size)
return 1;
return 0;
}
@@ -1964,11 +2100,11 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ unsigned long bio_flags)
+
{
- struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
@@ -2003,8 +2139,7 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
- bio_offset, inode,
- btrfs_submit_bio_start);
+ 0, inode, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -2013,7 +2148,7 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
}
mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
out:
if (ret) {
@@ -2046,7 +2181,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
- struct extent_state **cached_state, int dedupe)
+ struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
@@ -2112,17 +2247,21 @@ again:
}
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- &cached_state, 0);
+ &cached_state);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ClearPageChecked(page);
- goto out;
+ goto out_reserved;
}
ClearPageChecked(page);
set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
+out_reserved:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ PAGE_SIZE, true);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2163,8 +2302,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
SetPageChecked(page);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
- btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EBUSY;
@@ -2531,6 +2669,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
struct btrfs_file_extent_item *item;
struct btrfs_ordered_extent *ordered;
struct btrfs_trans_handle *trans;
+ struct btrfs_ref ref = { 0 };
struct btrfs_root *root;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -2577,7 +2716,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
return 0;
@@ -2701,10 +2840,11 @@ again:
inode_add_bytes(inode, len);
btrfs_release_path(path);
- ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
- new->disk_len, 0,
- backref->root_id, backref->inum,
- new->file_pos); /* start - extent_offset */
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
+ new->disk_len, 0);
+ btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
+ new->file_pos); /* start - extent_offset */
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
@@ -2900,7 +3040,7 @@ out_kfree:
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
@@ -2928,7 +3068,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
int compress_type = 0;
int ret = 0;
u64 logical_len = ordered_extent->len;
- bool nolock;
+ bool freespace_inode;
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
@@ -2939,7 +3079,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
clear_new_delalloc_bytes = true;
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -2970,8 +3110,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3005,8 +3145,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
EXTENT_DEFRAG, 0, 0, &cached_state);
}
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3155,7 +3295,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -3164,16 +3303,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
- NULL);
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &ordered_extent->work);
}
@@ -3182,16 +3317,23 @@ static int __readpage_endio_check(struct inode *inode,
int icsum, struct page *page,
int pgoff, u64 start, size_t len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
- u32 csum_expected;
- u32 csum = ~(u32)0;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ u8 *csum_expected;
+ u8 csum[BTRFS_CSUM_SIZE];
- csum_expected = *(((u32 *)io_bio->csum) + icsum);
+ csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + pgoff, csum, len);
- btrfs_csum_final(csum, (u8 *)&csum);
- if (csum != csum_expected)
+ shash->tfm = fs_info->csum_shash;
+
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + pgoff, len);
+ crypto_shash_final(shash, csum);
+
+ if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
kunmap_atomic(kaddr);
@@ -3265,6 +3407,28 @@ void btrfs_add_delayed_iput(struct inode *inode)
wake_up_process(fs_info->cleaner_kthread);
}
+static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ list_del_init(&inode->delayed_iput);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
+ spin_lock(&fs_info->delayed_iput_lock);
+}
+
+static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ if (!list_empty(&inode->delayed_iput)) {
+ spin_lock(&fs_info->delayed_iput_lock);
+ if (!list_empty(&inode->delayed_iput))
+ run_delayed_iput_locked(fs_info, inode);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ }
+}
+
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
@@ -3274,12 +3438,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
- list_del_init(&inode->delayed_iput);
- spin_unlock(&fs_info->delayed_iput_lock);
- iput(&inode->vfs_inode);
- if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
- wake_up(&fs_info->delayed_iputs_wait);
- spin_lock(&fs_info->delayed_iput_lock);
+ run_delayed_iput_locked(fs_info, inode);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
@@ -3408,7 +3567,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &found_key, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
@@ -3699,21 +3858,6 @@ cache_index:
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
- /*
- * Similar reasoning for last_link_trans, needs to be set otherwise
- * for a case like the following:
- *
- * mkdir A
- * touch foo
- * ln foo A/bar
- * echo 2 > /proc/sys/vm/drop_caches
- * fsync foo
- * <power failure>
- *
- * Would result in link bar and directory A not existing after the power
- * failure.
- */
- BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -3795,7 +3939,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
{
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, leaf);
btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
@@ -3929,9 +4073,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
- struct btrfs_key key;
u64 index;
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
@@ -3949,8 +4091,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
- leaf = path->nodes[0];
- btrfs_dir_item_key_to_cpu(leaf, di, &key);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto err;
@@ -4003,6 +4143,17 @@ skip_backref:
ret = 0;
else if (ret)
btrfs_abort_transaction(trans, ret);
+
+ /*
+ * If we have a pending delayed iput we could end up with the final iput
+ * being run in btrfs-cleaner context. If we have enough of these built
+ * up we can end up burning a lot of time in btrfs-cleaner without any
+ * way to throttle the unlinks. Since we're currently holding a ref on
+ * the inode we can run the delayed iput here without any issues as the
+ * final iput won't be done until after we drop the ref we're currently
+ * holding.
+ */
+ btrfs_run_delayed_iput(fs_info, inode);
err:
btrfs_free_path(path);
if (ret)
@@ -4087,18 +4238,30 @@ out:
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct inode *dir, u64 objectid,
- const char *name, int name_len)
+ struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
+ const char *name = dentry->d_name.name;
+ int name_len = dentry->d_name.len;
u64 index;
int ret;
+ u64 objectid;
u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
+ objectid = inode->root->root_key.objectid;
+ } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ objectid = inode->location.objectid;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4120,13 +4283,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
- dir_ino, &index, name, name_len);
- if (ret < 0) {
- if (ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ /*
+ * This is a placeholder inode for a subvolume we didn't have a
+ * reference to at the time of the snapshot creation. In the meantime
+ * we could have renamed the real subvol link into our snapshot, so
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * Instead simply lookup the dir_index_item for this entry so we can
+ * remove it. Otherwise we know we have a ref to the root and we can
+ * call btrfs_del_root_ref, and it _shouldn't_ fail.
+ */
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
if (IS_ERR_OR_NULL(di)) {
@@ -4141,8 +4307,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
index = key.offset;
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_del_root_ref(trans, objectid,
+ root->root_key.objectid, dir_ino,
+ &index, name, name_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
}
- btrfs_release_path(path);
ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
if (ret) {
@@ -4336,8 +4510,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
- ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
- dentry->d_name.name, dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
err = ret;
btrfs_abort_transaction(trans, ret);
@@ -4432,10 +4605,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return PTR_ERR(trans);
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, dir,
- BTRFS_I(inode)->location.objectid,
- dentry->d_name.name,
- dentry->d_name.len);
+ err = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
@@ -4679,7 +4849,7 @@ search_again:
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root->fs_info, path, size, 1);
+ btrfs_truncate_item(path, size, 1);
} else if (!del_item) {
/*
* We have to bail so the last_size is set to
@@ -4718,12 +4888,17 @@ delete:
if (found_extent &&
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root == fs_info->tree_root)) {
+ struct btrfs_ref ref = { 0 };
+
btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
- ret = btrfs_free_extent(trans, root, extent_start,
- extent_num_bytes, 0,
- btrfs_header_owner(leaf),
- ino, extent_offset);
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+ extent_start, extent_num_bytes, 0);
+ ref.real_root = root->root_key.objectid;
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ ino, extent_offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
@@ -4844,7 +5019,7 @@ again:
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
goto out;
}
@@ -4879,12 +5054,11 @@ again:
}
clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
- &cached_state, 0);
+ &cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
@@ -4912,7 +5086,7 @@ out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
@@ -4997,21 +5171,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent_bits(io_tree, hole_start, block_end - 1,
- &cached_state);
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
- block_end - hole_start);
- if (!ordered)
- break;
- unlock_extent_cached(io_tree, hole_start, block_end - 1,
- &cached_state);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- }
-
+ btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
+ block_end - 1, &cached_state);
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
@@ -5047,7 +5208,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
@@ -5278,9 +5438,9 @@ static void evict_inode_truncate_pages(struct inode *inode)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY |
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1, &cached_state);
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5293,59 +5453,50 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
- int failures = 0;
-
- for (;;) {
- struct btrfs_trans_handle *trans;
- int ret;
-
- ret = btrfs_block_rsv_refill(root, rsv,
- rsv->size + delayed_refs_extra,
- BTRFS_RESERVE_FLUSH_LIMIT);
-
- if (ret && ++failures > 2) {
- btrfs_warn(fs_info,
- "could not allocate space for a delete; will truncate on mount");
- return ERR_PTR(-ENOSPC);
- }
-
- /*
- * Evict can generate a large amount of delayed refs without
- * having a way to add space back since we exhaust our temporary
- * block rsv. We aren't allowed to do FLUSH_ALL in this case
- * because we could deadlock with so many things in the flushing
- * code, so we have to try and hold some extra space to
- * compensate for our delayed ref generation. If we can't get
- * that space then we need see if we can steal our minimum from
- * the global reserve. We will be ratelimited by the amount of
- * space we have for the delayed refs rsv, so we'll end up
- * committing and trying again.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret) {
- if (!IS_ERR(trans)) {
- trans->block_rsv = &fs_info->trans_block_rsv;
- trans->bytes_reserved = delayed_refs_extra;
- btrfs_block_rsv_migrate(rsv, trans->block_rsv,
- delayed_refs_extra, 1);
- }
- return trans;
- }
+ struct btrfs_trans_handle *trans;
+ u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+ int ret;
+ /*
+ * Eviction should be taking place at some place safe because of our
+ * delayed iputs. However the normal flushing code will run delayed
+ * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+ *
+ * We reserve the delayed_refs_extra here again because we can't use
+ * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+ * above. We reserve our extra bit here because we generate a ton of
+ * delayed refs activity by truncating.
+ *
+ * If we cannot make our reservation we'll attempt to steal from the
+ * global reserve, because we really want to be able to free up space.
+ */
+ ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
/*
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(fs_info) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))
- return trans;
+ if (btrfs_check_space_for_delayed_refs(fs_info) ||
+ btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+ btrfs_warn(fs_info,
+ "could not allocate space for delete; will truncate on mount");
+ return ERR_PTR(-ENOSPC);
+ }
+ delayed_refs_extra = 0;
+ }
- /* If not, commit and try again. */
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ERR_PTR(ret);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return trans;
+
+ if (delayed_refs_extra) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
}
+ return trans;
}
void btrfs_evict_inode(struct inode *inode)
@@ -5392,7 +5543,7 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
- rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
+ rsv->size = btrfs_calc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5448,12 +5599,14 @@ no_delete:
}
/*
- * this returns the key found in the dir entry in the location pointer.
+ * Return the key found in the dir entry in the location pointer, fill @type
+ * with BTRFS_FT_*, and return 0.
+ *
* If no dir entries were found, returns -ENOENT.
* If found a corrupted location in dir entry, returns -EUCLEAN.
*/
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
- struct btrfs_key *location)
+ struct btrfs_key *location, u8 *type)
{
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
@@ -5482,6 +5635,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
__func__, name, btrfs_ino(BTRFS_I(dir)),
location->objectid, location->type, location->offset);
}
+ if (!ret)
+ *type = btrfs_dir_type(path->nodes[0], di);
out:
btrfs_free_path(path);
return ret;
@@ -5592,7 +5747,6 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int empty = 0;
@@ -5605,7 +5759,6 @@ static void inode_tree_del(struct inode *inode)
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- synchronize_srcu(&fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
@@ -5649,12 +5802,14 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
return inode;
}
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its location and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
*/
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path)
+ struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
@@ -5669,8 +5824,6 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
- if (new)
- *new = 1;
} else {
iget_failed(inode);
/*
@@ -5688,9 +5841,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
}
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+ struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, new, NULL);
+ return btrfs_iget_path(s, location, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -5719,6 +5872,24 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+ /*
+ * Compile-time asserts that generic FT_* types still match
+ * BTRFS_FT_* types
+ */
+ BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+ BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+ BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+ BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+ BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+ BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+ BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+ BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+
+ return fs_umode_to_ftype(inode->i_mode);
+}
+
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -5726,18 +5897,31 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
+ u8 di_type = 0;
int index;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ret = btrfs_inode_by_name(dir, dentry, &location);
+ ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
if (ret < 0)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, root);
+ if (IS_ERR(inode))
+ return inode;
+
+ /* Do extra check against inode mode with di_type */
+ if (btrfs_inode_type(inode) != di_type) {
+ btrfs_crit(fs_info,
+"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
+ inode->i_mode, btrfs_inode_type(inode),
+ di_type);
+ iput(inode);
+ return ERR_PTR(-EUCLEAN);
+ }
return inode;
}
@@ -5750,7 +5934,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, sub_root);
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -5797,10 +5981,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return d_splice_alias(inode, dentry);
}
-unsigned char btrfs_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
/*
* All this infrastructure exists because dir_emit can fault, and we are holding
* the tree lock when doing readdir. For now just allocate a buffer and copy
@@ -5939,7 +6119,7 @@ again:
name_ptr = (char *)(entry + 1);
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
name_len);
- put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+ put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
&entry->type);
btrfs_dir_item_key_to_cpu(leaf, di, &location);
put_unaligned(location.objectid, &entry->ino);
@@ -6190,13 +6370,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
+ unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
+ nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
+ memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
@@ -6342,11 +6525,6 @@ fail:
return ERR_PTR(ret);
}
-static inline u8 btrfs_inode_type(struct inode *inode)
-{
- return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
-}
-
/*
* utility function to add 'inode' into 'parent_inode' with
* a give name and a given sequence number.
@@ -6396,8 +6574,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name_len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
- current_time(&parent_inode->vfs_inode);
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+ struct timespec64 now = current_time(&parent_inode->vfs_inode);
+
+ parent_inode->vfs_inode.i_mtime = now;
+ parent_inode->vfs_inode.i_ctime = now;
+ }
ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -6634,7 +6822,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
- BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
@@ -6796,8 +6983,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
- if (em)
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
if (em) {
@@ -6813,7 +6998,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
@@ -6864,6 +7048,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_start = found_key.offset;
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ /* Only regular file could have regular/prealloc extent */
+ if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ ret = -EUCLEAN;
+ btrfs_crit(fs_info,
+ "regular/prealloc extent found for non-regular inode %llu",
+ btrfs_ino(inode));
+ goto out;
+ }
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
@@ -7064,7 +7256,6 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = NULL;
ASSERT(hole_em);
/*
@@ -7424,7 +7615,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
{
struct extent_map_tree *em_tree;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
@@ -7442,7 +7632,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
@@ -7481,6 +7670,8 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
struct inode *inode,
u64 start, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
if (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return -ENOENT;
@@ -7490,7 +7681,7 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
return 0;
@@ -7573,7 +7764,7 @@ skip_cow:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7604,12 +7795,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
- int unlock_bits = EXTENT_LOCKED;
int ret = 0;
- if (create)
- unlock_bits |= EXTENT_DIRTY;
- else
+ if (!create)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
@@ -7668,9 +7856,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (ret < 0)
goto unlock_err;
- /* clear and unlock the entire range */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, 1, 0, &cached_state);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
} else {
ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
start, len);
@@ -7686,9 +7873,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
*/
lockstart = start + bh_result->b_size;
if (lockstart < lockend) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, unlock_bits, 1, 0,
- &cached_state);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
} else {
free_extent_state(cached_state);
}
@@ -7699,8 +7885,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
return 0;
unlock_err:
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, 1, 0, &cached_state);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
err:
if (dio_data)
current->journal_info = dio_data;
@@ -7720,7 +7906,7 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
if (ret)
return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
return ret;
}
@@ -7828,7 +8014,6 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct inode *inode = done->inode;
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
- int i;
struct bvec_iter_all iter_all;
if (bio->bi_status)
@@ -7841,7 +8026,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all)
+ bio_for_each_segment_all(bvec, bio, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@@ -7919,7 +8104,7 @@ static void btrfs_retry_endio(struct bio *bio)
struct bio_vec *bvec;
int uptodate;
int ret;
- int i;
+ int i = 0;
struct bvec_iter_all iter_all;
if (bio->bi_status)
@@ -7934,7 +8119,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
@@ -7946,6 +8131,7 @@ static void btrfs_retry_endio(struct bio *bio)
bvec->bv_offset);
else
uptodate = 0;
+ i++;
}
done->uptodate = uptodate;
@@ -8073,18 +8259,14 @@ static void __endio_write_update_ordered(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
@@ -8092,9 +8274,8 @@ static void __endio_write_update_ordered(struct inode *inode,
&ordered_offset,
ordered_bytes,
uptodate)) {
- btrfs_init_work(&ordered->work, func,
- finish_ordered_fn,
- NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
+ NULL);
btrfs_queue_work(wq, &ordered->work);
}
/*
@@ -8251,7 +8432,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto err;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
+ ret = btrfs_map_bio(fs_info, bio, 0);
err:
return ret;
}
@@ -8264,22 +8445,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
struct bio *orig_bio = dip->orig_bio;
u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
- u64 map_length;
int async_submit = 0;
u64 submit_len;
int clone_offset = 0;
int clone_len;
int ret;
blk_status_t status;
+ struct btrfs_io_geometry geom;
- map_length = orig_bio->bi_iter.bi_size;
- submit_len = map_length;
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
- &map_length, NULL, 0);
+ submit_len = orig_bio->bi_iter.bi_size;
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
+ start_sector << 9, submit_len, &geom);
if (ret)
return -EIO;
- if (map_length >= submit_len) {
+ if (geom.len >= submit_len) {
bio = orig_bio;
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
@@ -8292,10 +8472,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
async_submit = 1;
/* bio split */
- ASSERT(map_length <= INT_MAX);
+ ASSERT(geom.len <= INT_MAX);
atomic_inc(&dip->pending_bios);
do {
- clone_len = min_t(int, submit_len, map_length);
+ clone_len = min_t(int, submit_len, geom.len);
/*
* This will never fail as it's passing GPF_NOFS and
@@ -8332,9 +8512,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
start_sector += clone_len >> 9;
file_offset += clone_len;
- map_length = submit_len;
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
- start_sector << 9, &map_length, NULL, 0);
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
+ start_sector << 9, submit_len, &geom);
if (ret)
goto out_err;
} while (submit_len > 0);
@@ -8586,7 +8765,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
out:
if (wakeup)
@@ -8717,8 +8896,7 @@ again:
*/
if (!inode_evicting)
clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW |
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
/*
@@ -8773,8 +8951,7 @@ again:
if (PageDirty(page))
btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
- clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY |
+ clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
&cached_state);
@@ -8902,12 +9079,11 @@ again:
* reserve data&meta space before lock_page() (see above comments).
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, &cached_state);
ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
- &cached_state, 0);
+ &cached_state);
if (ret2) {
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
@@ -8939,7 +9115,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -8948,7 +9124,7 @@ again:
out_unlock:
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
@@ -8965,7 +9141,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
- u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
+ u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -9163,7 +9339,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
- ei->last_link_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -9182,13 +9357,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
- extent_io_tree_init(&ei->io_tree, inode);
- extent_io_tree_init(&ei->io_failure_tree, inode);
- ei->io_tree.track_uptodate = 1;
- ei->io_failure_tree.track_uptodate = 1;
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
+ extent_io_tree_init(fs_info, &ei->io_failure_tree,
+ IO_TREE_INODE_IO_FAILURE, inode);
+ ei->io_tree.track_uptodate = true;
+ ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
- mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
@@ -9206,9 +9381,8 @@ void btrfs_test_destroy_inode(struct inode *inode)
}
#endif
-static void btrfs_i_callback(struct rcu_head *head)
+void btrfs_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -9234,7 +9408,7 @@ void btrfs_destroy_inode(struct inode *inode)
* created.
*/
if (!root)
- goto free;
+ return;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -9252,8 +9426,6 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
-free:
- call_rcu(&inode->i_rcu, btrfs_i_callback);
}
int btrfs_drop_inode(struct inode *inode)
@@ -9288,6 +9460,7 @@ void __cold btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
kmem_cache_destroy(btrfs_free_space_cachep);
+ kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
int __init btrfs_init_cachep(void)
@@ -9317,6 +9490,12 @@ int __init btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
+ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
+ PAGE_SIZE, PAGE_SIZE,
+ SLAB_RED_ZONE, NULL);
+ if (!btrfs_free_space_bitmap_cachep)
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -9376,7 +9555,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
- u64 root_objectid;
int ret;
bool root_log_pinned = false;
bool dest_log_pinned = false;
@@ -9394,9 +9572,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_init_log_ctx(&ctx_dest, new_inode);
/* close the race window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&fs_info->subvol_sem);
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/*
@@ -9413,6 +9590,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
@@ -9430,7 +9610,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* Reference for the source. */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
} else {
btrfs_pin_log_trans(root);
root_log_pinned = true;
@@ -9447,7 +9627,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* And now for the dest. */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
} else {
btrfs_pin_log_trans(dest);
dest_log_pinned = true;
@@ -9480,10 +9660,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
@@ -9499,10 +9676,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
@@ -9583,7 +9757,7 @@ out_fail:
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
(new_inode &&
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
btrfs_end_log_trans(root);
@@ -9607,6 +9781,18 @@ out_fail:
commit_transaction = true;
}
if (commit_transaction) {
+ /*
+ * We may have set commit_transaction when logging the new name
+ * in the destination root, in which case we left the source
+ * root context in the list of log contextes. So make sure we
+ * remove it to avoid invalid memory accesses, since the context
+ * was allocated in our stack frame.
+ */
+ if (sync_log_root) {
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx_root.list);
+ mutex_unlock(&root->log_mutex);
+ }
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
@@ -9615,11 +9801,13 @@ out_fail:
ret = ret ? ret : ret2;
}
out_notrans:
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&fs_info->subvol_sem);
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ ASSERT(list_empty(&ctx_root.list));
+ ASSERT(list_empty(&ctx_dest.list));
+
return ret;
}
@@ -9686,7 +9874,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
- u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
@@ -9769,7 +9956,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
} else {
btrfs_pin_log_trans(root);
log_pinned = true;
@@ -9794,10 +9981,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
@@ -9816,10 +10000,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- root_objectid = BTRFS_I(new_inode)->location.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
@@ -9890,7 +10071,7 @@ out_fail:
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
(new_inode &&
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
log_pinned = false;
@@ -9964,8 +10145,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
- btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -10193,7 +10373,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -10299,7 +10478,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
@@ -10655,7 +10833,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 len = isize - start;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd4e693406a0..12ae31e1813e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,9 @@
#include "qgroup.h"
#include "tree-log.h"
#include "compression.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -189,9 +192,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
- u64 old_flags;
- unsigned int old_i_flags;
- umode_t mode;
+ const char *comp = NULL;
+ u32 binode_flags = binode->flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -212,66 +214,59 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode_lock(inode);
- old_flags = binode->flags;
- old_i_flags = inode->i_flags;
- mode = inode->i_mode;
-
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
- if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- ret = -EPERM;
- goto out_unlock;
- }
- }
+ ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+ if (ret)
+ goto out_unlock;
if (fsflags & FS_SYNC_FL)
- binode->flags |= BTRFS_INODE_SYNC;
+ binode_flags |= BTRFS_INODE_SYNC;
else
- binode->flags &= ~BTRFS_INODE_SYNC;
+ binode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
- binode->flags |= BTRFS_INODE_IMMUTABLE;
+ binode_flags |= BTRFS_INODE_IMMUTABLE;
else
- binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+ binode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
- binode->flags |= BTRFS_INODE_APPEND;
+ binode_flags |= BTRFS_INODE_APPEND;
else
- binode->flags &= ~BTRFS_INODE_APPEND;
+ binode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
- binode->flags |= BTRFS_INODE_NODUMP;
+ binode_flags |= BTRFS_INODE_NODUMP;
else
- binode->flags &= ~BTRFS_INODE_NODUMP;
+ binode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
- binode->flags |= BTRFS_INODE_NOATIME;
+ binode_flags |= BTRFS_INODE_NOATIME;
else
- binode->flags &= ~BTRFS_INODE_NOATIME;
+ binode_flags &= ~BTRFS_INODE_NOATIME;
if (fsflags & FS_DIRSYNC_FL)
- binode->flags |= BTRFS_INODE_DIRSYNC;
+ binode_flags |= BTRFS_INODE_DIRSYNC;
else
- binode->flags &= ~BTRFS_INODE_DIRSYNC;
+ binode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
- if (S_ISREG(mode)) {
+ if (S_ISREG(inode->i_mode)) {
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
if (inode->i_size == 0)
- binode->flags |= BTRFS_INODE_NODATACOW
- | BTRFS_INODE_NODATASUM;
+ binode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
} else {
- binode->flags |= BTRFS_INODE_NODATACOW;
+ binode_flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
* Revert back under same assumptions as above
*/
- if (S_ISREG(mode)) {
+ if (S_ISREG(inode->i_mode)) {
if (inode->i_size == 0)
- binode->flags &= ~(BTRFS_INODE_NODATACOW
- | BTRFS_INODE_NODATASUM);
+ binode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
} else {
- binode->flags &= ~BTRFS_INODE_NODATACOW;
+ binode_flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -281,57 +276,59 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
- binode->flags &= ~BTRFS_INODE_COMPRESS;
- binode->flags |= BTRFS_INODE_NOCOMPRESS;
-
- ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
- if (ret && ret != -ENODATA)
- goto out_drop;
+ binode_flags &= ~BTRFS_INODE_COMPRESS;
+ binode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- const char *comp;
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
goto out_unlock;
}
- binode->flags |= BTRFS_INODE_COMPRESS;
- binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ binode_flags |= BTRFS_INODE_COMPRESS;
+ binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
-
- ret = btrfs_set_prop(inode, "btrfs.compression",
- comp, strlen(comp), 0);
- if (ret)
- goto out_drop;
-
} else {
- ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
- if (ret && ret != -ENODATA)
- goto out_drop;
- binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * 1 for inode item
+ * 2 for properties
+ */
+ trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_drop;
+ goto out_unlock;
+ }
+
+ if (comp) {
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
+ strlen(comp), 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ } else {
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
+ 0, 0);
+ if (ret && ret != -ENODATA) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
}
+ binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
+ out_end_trans:
btrfs_end_transaction(trans);
- out_drop:
- if (ret) {
- binode->flags = old_flags;
- inode->i_flags = old_i_flags;
- }
-
out_unlock:
inode_unlock(inode);
mnt_drop_write_file(file);
@@ -379,9 +376,7 @@ static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
struct btrfs_inode *binode = BTRFS_I(file_inode(file));
struct fsxattr fa;
- memset(&fa, 0, sizeof(fa));
- fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
-
+ simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
if (copy_to_user(arg, &fa, sizeof(fa)))
return -EFAULT;
@@ -394,7 +389,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
- struct fsxattr fa;
+ struct fsxattr fa, old_fa;
unsigned old_flags;
unsigned old_i_flags;
int ret = 0;
@@ -405,7 +400,6 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
if (btrfs_root_readonly(root))
return -EROFS;
- memset(&fa, 0, sizeof(fa));
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
@@ -425,13 +419,11 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
old_flags = binode->flags;
old_i_flags = inode->i_flags;
- /* We need the capabilities to change append-only or immutable inode */
- if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
- (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
- !capable(CAP_LINUX_IMMUTABLE)) {
- ret = -EPERM;
+ simple_fill_fsxattr(&old_fa,
+ btrfs_inode_flags_to_xflags(binode->flags));
+ ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
+ if (ret)
goto out_unlock;
- }
if (fa.fsx_xflags & FS_XFLAG_SYNC)
binode->flags |= BTRFS_INODE_SYNC;
@@ -487,10 +479,9 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
return put_user(inode->i_generation, arg);
}
-static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
@@ -549,7 +540,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return 0;
}
-int btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(u8 *uuid)
{
int i;
@@ -713,11 +704,17 @@ static noinline int create_subvol(struct inode *dir,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
@@ -1341,9 +1338,8 @@ again:
lock_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- &cached_state);
+ page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, &cached_state);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -1369,8 +1365,7 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1381,8 +1376,7 @@ out:
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -1420,7 +1414,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
return -EINVAL;
if (do_compress) {
- if (range->compress_type > BTRFS_COMPRESS_TYPES)
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
return -EINVAL;
if (range->compress_type)
compress_type = range->compress_type;
@@ -1849,8 +1843,15 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
goto free_args;
}
- if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+ if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ btrfs_warn(fs_info,
+"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
+
ptr = &transid;
+ }
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
@@ -2466,7 +2467,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
goto out;
}
- temp_inode = btrfs_iget(sb, &key2, root, NULL);
+ temp_inode = btrfs_iget(sb, &key2, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out;
@@ -2931,8 +2932,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err)
+ if (!err) {
+ fsnotify_rmdir(dir, dentry);
d_delete(dentry);
+ }
out_dput:
dput(dentry);
@@ -3260,6 +3263,19 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
{
int ret;
u64 i, tail_len, chunk_count;
+ struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+ spin_lock(&root_dst->root_item_lock);
+ if (root_dst->send_in_progress) {
+ btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+ root_dst->root_key.objectid,
+ root_dst->send_in_progress);
+ spin_unlock(&root_dst->root_item_lock);
+ return -EAGAIN;
+ }
+ root_dst->dedupe_in_progress++;
+ spin_unlock(&root_dst->root_item_lock);
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
@@ -3268,7 +3284,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
dst, dst_loff);
if (ret)
- return ret;
+ goto out;
loff += BTRFS_MAX_DEDUPE_LEN;
dst_loff += BTRFS_MAX_DEDUPE_LEN;
@@ -3277,6 +3293,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (tail_len > 0)
ret = btrfs_extent_same_range(src, loff, tail_len, dst,
dst_loff);
+out:
+ spin_lock(&root_dst->root_item_lock);
+ root_dst->dedupe_in_progress--;
+ spin_unlock(&root_dst->root_item_lock);
return ret;
}
@@ -3314,61 +3334,6 @@ out:
return ret;
}
-static void clone_update_extent_map(struct btrfs_inode *inode,
- const struct btrfs_trans_handle *trans,
- const struct btrfs_path *path,
- const u64 hole_offset,
- const u64 hole_len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
- return;
- }
-
- if (path) {
- struct btrfs_file_extent_item *fi;
-
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
- em->generation = -1;
- if (btrfs_file_extent_type(path->nodes[0], fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
- } else {
- em->start = hole_offset;
- em->len = hole_len;
- em->ram_bytes = em->len;
- em->orig_start = hole_offset;
- em->block_start = EXTENT_MAP_HOLE;
- em->block_len = 0;
- em->orig_block_len = 0;
- em->compress_type = BTRFS_COMPRESS_NONE;
- em->generation = trans->transid;
- }
-
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(inode, em->start,
- em->start + em->len - 1, 0);
- }
-
- if (ret)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
-}
-
/*
* Make sure we do not end up inserting an inline extent into a file that has
* already other (non-inline) extents. If a file has an inline extent it can
@@ -3509,6 +3474,7 @@ copy_inline_extent:
path->slots[0]),
size);
inode_add_bytes(dst, datal);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
return 0;
}
@@ -3560,6 +3526,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
while (1) {
u64 next_key_min_offset = key.offset + 1;
+ struct btrfs_file_extent_item *extent;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+ u64 drop_start;
/*
* note the key will change type as we walk through the
@@ -3600,75 +3574,115 @@ process_slot:
key.objectid != btrfs_ino(BTRFS_I(src)))
break;
- if (key.type == BTRFS_EXTENT_DATA_KEY) {
- struct btrfs_file_extent_item *extent;
- int type;
- u32 size;
- struct btrfs_key new_key;
- u64 disko = 0, diskl = 0;
- u64 datao = 0, datal = 0;
- u8 comp;
- u64 drop_start;
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- comp = btrfs_file_extent_compression(leaf, extent);
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- disko = btrfs_file_extent_disk_bytenr(leaf,
- extent);
- diskl = btrfs_file_extent_disk_num_bytes(leaf,
- extent);
- datao = btrfs_file_extent_offset(leaf, extent);
- datal = btrfs_file_extent_num_bytes(leaf,
- extent);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- /* take upper bound, may be compressed */
- datal = btrfs_file_extent_ram_bytes(leaf,
- extent);
- }
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf, extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf, extent);
+ }
+
+ /*
+ * The first search might have left us at an extent item that
+ * ends before our target range's start, can happen if we have
+ * holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+ next_key_min_offset = key.offset + datal;
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = btrfs_ino(BTRFS_I(inode));
+ if (off <= key.offset)
+ new_key.offset = key.offset + destoff - off;
+ else
+ new_key.offset = destoff;
+
+ /*
+ * Deal with a hole that doesn't have an extent item that
+ * represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning range or at
+ * the beginning (fully overlaps it or partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct btrfs_clone_extent_info clone_info;
/*
- * The first search might have left us at an extent
- * item that ends before our target range's start, can
- * happen if we have holes and NO_HOLES feature enabled.
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
*/
- if (key.offset + datal <= off) {
- path->slots[0]++;
- goto process_slot;
- } else if (key.offset >= off + len) {
- break;
+
+ /* Subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* Subtract range a */
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
}
- next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
- btrfs_release_path(path);
- path->leave_spinning = 0;
+ clone_info.disk_offset = disko;
+ clone_info.disk_len = diskl;
+ clone_info.data_offset = datao;
+ clone_info.data_len = datal;
+ clone_info.file_offset = new_key.offset;
+ clone_info.extent_buf = buf;
+ clone_info.item_size = size;
+ ret = btrfs_punch_hole_range(inode, path,
+ drop_start,
+ new_key.offset + datal - 1,
+ &clone_info, &trans);
+ if (ret)
+ goto out;
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 skip = 0;
+ u64 trim = 0;
- memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = btrfs_ino(BTRFS_I(inode));
- if (off <= key.offset)
- new_key.offset = key.offset + destoff - off;
- else
- new_key.offset = destoff;
+ if (off > key.offset) {
+ skip = off - key.offset;
+ new_key.offset += skip;
+ }
- /*
- * Deal with a hole that doesn't have an extent item
- * that represents it (NO_HOLES feature enabled).
- * This hole is either in the middle of the cloning
- * range or at the beginning (fully overlaps it or
- * partially overlaps it).
- */
- if (new_key.offset != last_dest_end)
- drop_start = last_dest_end;
- else
- drop_start = new_key.offset;
+ if (key.offset + datal > off + len)
+ trim = key.offset + datal - (off + len);
+
+ if (comp && (skip || trim)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ size -= skip + trim;
+ datal -= skip + trim;
/*
+ * If our extent is inline, we know we will drop or
+ * adjust at most 1 extent item in the destination root.
+ *
* 1 - adjusting old extent (we may have to split it)
* 1 - add new extent
* 1 - inode update
@@ -3679,137 +3693,28 @@ process_slot:
goto out;
}
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- /*
- * a | --- range to clone ---| b
- * | ------------- extent ------------- |
- */
-
- /* subtract range b */
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
- /* subtract range a */
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- ret = btrfs_drop_extents(trans, root, inode,
- drop_start,
- new_key.offset + datal,
- 1);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans,
- ret);
- btrfs_end_transaction(trans);
- goto out;
- }
-
- ret = btrfs_insert_empty_item(trans, root, path,
- &new_key, size);
- if (ret) {
+ ret = clone_copy_inline_extent(inode, trans, path,
+ &new_key, drop_start,
+ datal, skip, size, buf);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- write_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
-
- /* disko == 0 means it's a hole */
- if (!disko)
- datao = 0;
-
- btrfs_set_file_extent_offset(leaf, extent,
- datao);
- btrfs_set_file_extent_num_bytes(leaf, extent,
- datal);
-
- if (disko) {
- inode_add_bytes(inode, datal);
- ret = btrfs_inc_extent_ref(trans,
- root,
- disko, diskl, 0,
- root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)),
- new_key.offset - datao);
- if (ret) {
- btrfs_abort_transaction(trans,
- ret);
- btrfs_end_transaction(trans);
- goto out;
-
- }
- }
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 skip = 0;
- u64 trim = 0;
-
- if (off > key.offset) {
- skip = off - key.offset;
- new_key.offset += skip;
- }
-
- if (key.offset + datal > off + len)
- trim = key.offset + datal - (off + len);
-
- if (comp && (skip || trim)) {
- ret = -EINVAL;
- btrfs_end_transaction(trans);
- goto out;
- }
- size -= skip + trim;
- datal -= skip + trim;
-
- ret = clone_copy_inline_extent(inode,
- trans, path,
- &new_key,
- drop_start,
- datal,
- skip, size, buf);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans,
- ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- leaf = path->nodes[0];
- slot = path->slots[0];
+ btrfs_end_transaction(trans);
+ goto out;
}
+ }
- /* If we have an implicit hole (NO_HOLES feature). */
- if (drop_start < new_key.offset)
- clone_update_extent_map(BTRFS_I(inode), trans,
- NULL, drop_start,
- new_key.offset - drop_start);
-
- clone_update_extent_map(BTRFS_I(inode), trans,
- path, 0, 0);
+ btrfs_release_path(path);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
+ last_dest_end = ALIGN(new_key.offset + datal,
+ fs_info->sectorsize);
+ ret = clone_finish_inode_update(trans, inode, last_dest_end,
+ destoff, olen, no_time_update);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
- last_dest_end = ALIGN(new_key.offset + datal,
- fs_info->sectorsize);
- ret = clone_finish_inode_update(trans, inode,
- last_dest_end,
- destoff, olen,
- no_time_update);
- if (ret)
- goto out;
- if (new_key.offset + datal >= destoff + len)
- break;
- }
btrfs_release_path(path);
key.offset = next_key_min_offset;
@@ -3822,31 +3727,20 @@ process_slot:
if (last_dest_end < destoff + len) {
/*
- * We have an implicit hole (NO_HOLES feature is enabled) that
- * fully or partially overlaps our cloning range at its end.
+ * We have an implicit hole that fully or partially overlaps our
+ * cloning range at its end. This means that we either have the
+ * NO_HOLES feature enabled or the implicit hole happened due to
+ * mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
+ path->leave_spinning = 0;
- /*
- * 1 - remove extent(s)
- * 1 - inode update
- */
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
- ret = btrfs_drop_extents(trans, root, inode,
- last_dest_end, destoff + len, 1);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
+ ret = btrfs_punch_hole_range(inode, path,
+ last_dest_end, destoff + len - 1,
+ NULL, &trans);
+ if (ret)
goto out;
- }
- clone_update_extent_map(BTRFS_I(inode), trans, NULL,
- last_dest_end,
- destoff + len - last_dest_end);
+
ret = clone_finish_inode_update(trans, inode, destoff + len,
destoff, olen, no_time_update);
}
@@ -3948,16 +3842,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return -EXDEV;
}
- if (same_inode)
- inode_lock(inode_in);
- else
- lock_two_nondirectories(inode_in, inode_out);
-
/* don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
+ return -EINVAL;
}
/*
@@ -3988,29 +3876,38 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (!same_inode)
inode_dio_wait(inode_out);
+ /*
+ * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+ *
+ * Btrfs' back references do not have a block level granularity, they
+ * work at the whole extent level.
+ * NOCOW buffered write without data space reserved may not be able
+ * to fall back to CoW due to lack of data space, thus could cause
+ * data loss.
+ *
+ * Here we take a shortcut by flushing the whole inode, so that all
+ * nocow write should reach disk as nocow before we increase the
+ * reference of the extent. We could do better by only flushing NOCOW
+ * data, but that needs extra accounting.
+ *
+ * Also we don't need to check ASYNC_EXTENT, as async extent will be
+ * CoWed anyway, not affecting nocow part.
+ */
+ ret = filemap_flush(inode_in->i_mapping);
+ if (ret < 0)
+ return ret;
+
ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
- goto out_unlock;
+ return ret;
ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
- goto out_unlock;
+ return ret;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
- if (ret < 0 || *len == 0)
- goto out_unlock;
-
- return 0;
-
- out_unlock:
- if (same_inode)
- inode_unlock(inode_in);
- else
- unlock_two_nondirectories(inode_in, inode_out);
-
- return ret;
}
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
@@ -4025,16 +3922,22 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
+ if (same_inode)
+ inode_lock(src_inode);
+ else
+ lock_two_nondirectories(src_inode, dst_inode);
+
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
if (ret < 0 || len == 0)
- return ret;
+ goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+out_unlock:
if (same_inode)
inode_unlock(src_inode);
else
@@ -4128,16 +4031,15 @@ out:
static void get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
space->total_bytes = 0;
space->used_bytes = 0;
space->flags = 0;
list_for_each_entry(block_group, groups_list, list) {
space->flags = block_group->flags;
- space->total_bytes += block_group->key.offset;
- space->used_bytes +=
- btrfs_block_group_used(&block_group->item);
+ space->total_bytes += block_group->length;
+ space->used_bytes += block_group->used;
}
}
@@ -4350,7 +4252,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
+ /*
+ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
+ * error. This is important as it allows user space to know how much
+ * progress scrub has done. For example, if scrub is canceled we get
+ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
+ * space. Later user space can inspect the progress from the structure
+ * btrfs_ioctl_scrub_args and resume scrub from where it left off
+ * previously (btrfs-progs does this).
+ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
+ * then return -EFAULT to signal the structure was not copied or it may
+ * be corrupt and unreliable due to a partial copy.
+ */
+ if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -5048,10 +4962,9 @@ drop_write:
return ret;
}
-static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret = 0;
@@ -5074,11 +4987,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
return ret;
}
-static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -5250,10 +5161,9 @@ out:
return ret;
}
-static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
size_t len;
int ret;
char label[BTRFS_LABEL_SIZE];
@@ -5337,10 +5247,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg)
return 0;
}
-static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags features;
@@ -5359,7 +5268,7 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
{
- const char *type = btrfs_feature_set_names[set];
+ const char *type = btrfs_feature_set_name(set);
char *names;
u64 disallowed, unsupported;
u64 set_mask = flags & change_mask;
@@ -5540,8 +5449,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
+ case FS_IOC_GETFSLABEL:
+ return btrfs_ioctl_get_fslabel(fs_info, argp);
+ case FS_IOC_SETFSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
case FITRIM:
- return btrfs_ioctl_fitrim(file, argp);
+ return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
@@ -5646,19 +5559,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN:
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
- return btrfs_ioctl_quota_rescan_status(file, argp);
+ return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(file, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
- case BTRFS_IOC_GET_FSLABEL:
- return btrfs_ioctl_get_fslabel(file, argp);
- case BTRFS_IOC_SET_FSLABEL:
- return btrfs_ioctl_set_fslabel(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(argp);
case BTRFS_IOC_GET_FEATURES:
- return btrfs_ioctl_get_features(file, argp);
+ return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case FS_IOC_FSGETXATTR:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 82b84e4daad1..571c4826c428 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,14 +8,194 @@
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
+#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+/*
+ * Extent buffer locking
+ * =====================
+ *
+ * The locks use a custom scheme that allows to do more operations than are
+ * available fromt current locking primitives. The building blocks are still
+ * rwlock and wait queues.
+ *
+ * Required semantics:
+ *
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - spinning lock semantics
+ * - blocking lock semantics
+ * - try-lock semantics for readers and writers
+ * - one level nesting, allowing read lock to be taken by the same thread that
+ * already has write lock
+ *
+ * The extent buffer locks (also called tree locks) manage access to eb data
+ * related to the storage in the b-tree (keys, items, but not the individual
+ * members of eb).
+ * We want concurrency of many readers and safe updates. The underlying locking
+ * is done by read-write spinlock and the blocking part is implemented using
+ * counters and wait queues.
+ *
+ * spinning semantics - the low-level rwlock is held so all other threads that
+ * want to take it are spinning on it.
+ *
+ * blocking semantics - the low-level rwlock is not held but the counter
+ * denotes how many times the blocking lock was held;
+ * sleeping is possible
+ *
+ * Write lock always allows only one thread to access the data.
+ *
+ *
+ * Debugging
+ * ---------
+ *
+ * There are additional state counters that are asserted in various contexts,
+ * removed from non-debug build to reduce extent_buffer size and for
+ * performance reasons.
+ *
+ *
+ * Lock nesting
+ * ------------
+ *
+ * A write operation on a tree might indirectly start a look up on the same
+ * tree. This can happen when btrfs_cow_block locks the tree and needs to
+ * lookup free extents.
+ *
+ * btrfs_cow_block
+ * ..
+ * alloc_tree_block_no_bg_flush
+ * btrfs_alloc_tree_block
+ * btrfs_reserve_extent
+ * ..
+ * load_free_space_cache
+ * ..
+ * btrfs_lookup_file_extent
+ * btrfs_search_slot
+ *
+ *
+ * Locking pattern - spinning
+ * --------------------------
+ *
+ * The simple locking scenario, the +--+ denotes the spinning section.
+ *
+ * +- btrfs_tree_lock
+ * | - extent_buffer::rwlock is held
+ * | - no heavy operations should happen, eg. IO, memory allocations, large
+ * | structure traversals
+ * +- btrfs_tree_unock
+*
+*
+ * Locking pattern - blocking
+ * --------------------------
+ *
+ * The blocking write uses the following scheme. The +--+ denotes the spinning
+ * section.
+ *
+ * +- btrfs_tree_lock
+ * |
+ * +- btrfs_set_lock_blocking_write
+ *
+ * - allowed: IO, memory allocations, etc.
+ *
+ * -- btrfs_tree_unlock - note, no explicit unblocking necessary
+ *
+ *
+ * Blocking read is similar.
+ *
+ * +- btrfs_tree_read_lock
+ * |
+ * +- btrfs_set_lock_blocking_read
+ *
+ * - heavy operations allowed
+ *
+ * +- btrfs_tree_read_unlock_blocking
+ * |
+ * +- btrfs_tree_read_unlock
+ *
+ */
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
+{
+ WARN_ON(eb->spinning_writers);
+ eb->spinning_writers++;
+}
+
+static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
+{
+ WARN_ON(eb->spinning_writers != 1);
+ eb->spinning_writers--;
+}
+
+static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
+{
+ WARN_ON(eb->spinning_writers);
+}
+
+static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
+{
+ atomic_inc(&eb->spinning_readers);
+}
+
+static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
+{
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+}
+
+static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+{
+ atomic_inc(&eb->read_locks);
+}
+
+static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
+{
+ atomic_dec(&eb->read_locks);
+}
+
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ BUG_ON(!atomic_read(&eb->read_locks));
+}
+
+static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
+{
+ eb->write_locks++;
+}
+
+static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+{
+ eb->write_locks--;
+}
+#else
+static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
+static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
+static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
+#endif
+
+/*
+ * Mark already held read lock as blocking. Can be nested in write lock by the
+ * same thread.
+ *
+ * Use when there are potentially long operations ahead so other thread waiting
+ * on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and blocking reader counter is increased.
+ */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
+ trace_btrfs_set_lock_blocking_read(eb);
/*
* No lock is required. The lock owner may change if we have a read
* lock, but it won't change to or away from us. If we have the write
@@ -25,13 +205,21 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
+ btrfs_assert_spinning_readers_put(eb);
read_unlock(&eb->lock);
}
+/*
+ * Mark already held write lock as blocking.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and blocking writers is set.
+ */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
+ trace_btrfs_set_lock_blocking_write(eb);
/*
* No lock is required. The lock owner may change if we have a read
* lock, but it won't change to or away from us. If we have the write
@@ -39,153 +227,138 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
- if (atomic_read(&eb->blocking_writers) == 0) {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
+ if (eb->blocking_writers == 0) {
+ btrfs_assert_spinning_writers_put(eb);
btrfs_assert_tree_locked(eb);
- atomic_inc(&eb->blocking_writers);
+ WRITE_ONCE(eb->blocking_writers, 1);
write_unlock(&eb->lock);
}
}
-void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
-{
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner)
- return;
- BUG_ON(atomic_read(&eb->blocking_readers) == 0);
- read_lock(&eb->lock);
- atomic_inc(&eb->spinning_readers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
-}
-
-void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
-{
- /*
- * no lock is required. The lock owner may change if
- * we have a read lock, but it won't change to or away
- * from us. If we have the write lock, we are the owner
- * and it'll never change.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner)
- return;
- BUG_ON(atomic_read(&eb->blocking_writers) != 1);
- write_lock(&eb->lock);
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_writers))
- cond_wake_up_nomb(&eb->write_lock_wq);
-}
-
/*
- * take a spinning read lock. This will wait for any blocking
- * writers
+ * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
+ * Can be nested in write lock by the same thread.
+ *
+ * Use when the locked section does only lightweight actions and busy waiting
+ * would be cheaper than making other threads do the wait/wake loop.
+ *
+ * The rwlock is held upon exit.
*/
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
-again:
- BUG_ON(!atomic_read(&eb->blocking_writers) &&
- current->pid == eb->lock_owner);
+ u64 start_ns = 0;
+ if (trace_btrfs_tree_read_lock_enabled())
+ start_ns = ktime_get_ns();
+again:
read_lock(&eb->lock);
- if (atomic_read(&eb->blocking_writers) &&
- current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread. We allow
- * an additional read lock to be added because it's for the same
- * thread. btrfs_find_all_roots() depends on this as it may be
- * called on a partly (write-)locked tree.
- */
- BUG_ON(eb->lock_nested);
- eb->lock_nested = 1;
- read_unlock(&eb->lock);
- return;
- }
- if (atomic_read(&eb->blocking_writers)) {
+ BUG_ON(eb->blocking_writers == 0 &&
+ current->pid == eb->lock_owner);
+ if (eb->blocking_writers) {
+ if (current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread.
+ * We allow an additional read lock to be added because
+ * it's for the same thread. btrfs_find_all_roots()
+ * depends on this as it may be called on a partly
+ * (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = true;
+ read_unlock(&eb->lock);
+ trace_btrfs_tree_read_lock(eb, start_ns);
+ return;
+ }
read_unlock(&eb->lock);
wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
+ READ_ONCE(eb->blocking_writers) == 0);
goto again;
}
- atomic_inc(&eb->read_locks);
- atomic_inc(&eb->spinning_readers);
+ btrfs_assert_tree_read_locks_get(eb);
+ btrfs_assert_spinning_readers_get(eb);
+ trace_btrfs_tree_read_lock(eb, start_ns);
}
/*
- * take a spinning read lock.
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Lock extent buffer for read, optimistically expecting that there are no
+ * contending blocking writers. If there are, don't wait.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
- if (atomic_read(&eb->blocking_writers))
+ if (READ_ONCE(eb->blocking_writers))
return 0;
read_lock(&eb->lock);
- if (atomic_read(&eb->blocking_writers)) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
- atomic_inc(&eb->read_locks);
- atomic_inc(&eb->spinning_readers);
+ btrfs_assert_tree_read_locks_get(eb);
+ btrfs_assert_spinning_readers_get(eb);
+ trace_btrfs_tree_read_lock_atomic(eb);
return 1;
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Try-lock for read. Don't block or wait for contending writers.
+ *
+ * Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (atomic_read(&eb->blocking_writers))
+ if (READ_ONCE(eb->blocking_writers))
return 0;
if (!read_trylock(&eb->lock))
return 0;
- if (atomic_read(&eb->blocking_writers)) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
- atomic_inc(&eb->read_locks);
- atomic_inc(&eb->spinning_readers);
+ btrfs_assert_tree_read_locks_get(eb);
+ btrfs_assert_spinning_readers_get(eb);
+ trace_btrfs_try_tree_read_lock(eb);
return 1;
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers or readers
+ * Try-lock for write. May block until the lock is uncontended, but does not
+ * wait until it is free.
+ *
+ * Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (atomic_read(&eb->blocking_writers) ||
- atomic_read(&eb->blocking_readers))
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
return 0;
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_writers) ||
- atomic_read(&eb->blocking_readers)) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
return 0;
}
- atomic_inc(&eb->write_locks);
- atomic_inc(&eb->spinning_writers);
+ btrfs_assert_tree_write_locks_get(eb);
+ btrfs_assert_spinning_writers_get(eb);
eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_write_lock(eb);
return 1;
}
/*
- * drop a spinning read lock
+ * Release read lock. Must be used only if the lock is in spinning mode. If
+ * the read lock is nested, must pair with read lock before the write unlock.
+ *
+ * The rwlock is not held upon exit.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
+ trace_btrfs_tree_read_unlock(eb);
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
@@ -193,21 +366,25 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
* field only matters to the lock owner.
*/
if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
+ eb->lock_nested = false;
return;
}
btrfs_assert_tree_read_locked(eb);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
- atomic_dec(&eb->read_locks);
+ btrfs_assert_spinning_readers_put(eb);
+ btrfs_assert_tree_read_locks_put(eb);
read_unlock(&eb->lock);
}
/*
- * drop a blocking read lock
+ * Release read lock, previously set to blocking by a pairing call to
+ * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
+ * thread.
+ *
+ * State of rwlock is unchanged, last reader wakes waiting threads.
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
+ trace_btrfs_tree_read_unlock_blocking(eb);
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
@@ -215,7 +392,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
* field only matters to the lock owner.
*/
if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
+ eb->lock_nested = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -223,63 +400,126 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_readers))
cond_wake_up_nomb(&eb->read_lock_wq);
- atomic_dec(&eb->read_locks);
+ btrfs_assert_tree_read_locks_put(eb);
}
/*
- * take a spinning write lock. This will wait for both
- * blocking readers or writers
+ * Lock for write. Wait for all blocking and spinning readers and writers. This
+ * starts context where reader lock could be nested by the same thread.
+ *
+ * The rwlock is held for write upon exit.
*/
void btrfs_tree_lock(struct extent_buffer *eb)
{
+ u64 start_ns = 0;
+
+ if (trace_btrfs_tree_lock_enabled())
+ start_ns = ktime_get_ns();
+
WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+ wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
write_lock(&eb->lock);
+ /* Refetch value after lock */
if (atomic_read(&eb->blocking_readers) ||
- atomic_read(&eb->blocking_writers)) {
+ READ_ONCE(eb->blocking_writers)) {
write_unlock(&eb->lock);
goto again;
}
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- atomic_inc(&eb->write_locks);
+ btrfs_assert_spinning_writers_get(eb);
+ btrfs_assert_tree_write_locks_get(eb);
eb->lock_owner = current->pid;
+ trace_btrfs_tree_lock(eb, start_ns);
}
/*
- * drop a spinning or a blocking write lock.
+ * Release the write lock, either blocking or spinning (ie. there's no need
+ * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
+ * This also ends the context for nesting, the read lock must have been
+ * released already.
+ *
+ * Tasks blocked and waiting are woken, rwlock is not held upon exit.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
- int blockers = atomic_read(&eb->blocking_writers);
+ /*
+ * This is read both locked and unlocked but always by the same thread
+ * that already owns the lock so we don't need to use READ_ONCE
+ */
+ int blockers = eb->blocking_writers;
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ trace_btrfs_tree_unlock(eb);
eb->lock_owner = 0;
- atomic_dec(&eb->write_locks);
+ btrfs_assert_tree_write_locks_put(eb);
if (blockers) {
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_dec(&eb->blocking_writers);
- /* Use the lighter barrier after atomic */
- smp_mb__after_atomic();
- cond_wake_up_nomb(&eb->write_lock_wq);
+ btrfs_assert_no_spinning_writers(eb);
+ /* Unlocked write */
+ WRITE_ONCE(eb->blocking_writers, 0);
+ /*
+ * We need to order modifying blocking_writers above with
+ * actually waking up the sleepers to ensure they see the
+ * updated value of blocking_writers
+ */
+ cond_wake_up(&eb->write_lock_wq);
} else {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
+ btrfs_assert_spinning_writers_put(eb);
write_unlock(&eb->lock);
}
}
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
+/*
+ * Set all locked nodes in the path to blocking locks. This should be done
+ * before scheduling
+ */
+void btrfs_set_path_blocking(struct btrfs_path *p)
{
- BUG_ON(!atomic_read(&eb->write_locks));
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
+ }
}
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+/*
+ * This releases any locks held in the path starting at level and going all the
+ * way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few corner
+ * cases, such as COW of the block at slot zero in the node. This ignores
+ * those rules, and it should only be called when there are no more updates to
+ * be done higher up in the tree.
+ */
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
- BUG_ON(!atomic_read(&eb->read_locks));
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 595014f64830..21a285883e89 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
+#include "extent_io.h"
+
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
#define BTRFS_WRITE_LOCK_BLOCKING 3
@@ -19,13 +21,20 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
-void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
-void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
+ BUG_ON(!eb->write_locks);
+}
+#else
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+#endif
+
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 579d53ae256f..aa9cd11f4b78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -63,27 +63,7 @@ struct workspace {
static struct workspace_manager wsm;
-static void lzo_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
-}
-
-static void lzo_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *lzo_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&wsm, level);
-}
-
-static void lzo_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void lzo_free_workspace(struct list_head *ws)
+void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -93,7 +73,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -131,13 +111,9 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
-static int lzo_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -303,7 +279,7 @@ out:
return ret;
}
-static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -444,10 +420,9 @@ done:
return ret;
}
-static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
@@ -507,20 +482,8 @@ out:
return ret;
}
-static unsigned int lzo_set_level(unsigned int level)
-{
- return 0;
-}
-
const struct btrfs_compress_op btrfs_lzo_compress = {
- .init_workspace_manager = lzo_init_workspace_manager,
- .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
- .get_workspace = lzo_get_workspace,
- .put_workspace = lzo_put_workspace,
- .alloc_workspace = lzo_alloc_workspace,
- .free_workspace = lzo_free_workspace,
- .compress_pages = lzo_compress_pages,
- .decompress_bio = lzo_decompress_bio,
- .decompress = lzo_decompress,
- .set_level = lzo_set_level,
+ .workspace_manager = &wsm,
+ .max_level = 1,
+ .default_level = 1,
};
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
deleted file mode 100644
index 75246f2f56ba..000000000000
--- a/fs/btrfs/math.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Fujitsu. All rights reserved.
- * Written by Miao Xie <miaox@cn.fujitsu.com>
- */
-
-#ifndef BTRFS_MATH_H
-#define BTRFS_MATH_H
-
-#include <asm/div64.h>
-
-static inline u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- return div_u64(num, 10);
-}
-
-static inline u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- return div_u64(num, 100);
-}
-
-#endif
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
new file mode 100644
index 000000000000..72bab64ecf60
--- /dev/null
+++ b/fs/btrfs/misc.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_MISC_H
+#define BTRFS_MISC_H
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <asm/div64.h>
+
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+ /*
+ * This implies a full smp_mb barrier, see comments for
+ * waitqueue_active why.
+ */
+ if (wq_has_sleeper(wq))
+ wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+ /*
+ * Special case for conditional wakeup where the barrier required for
+ * waitqueue_active is implied by some of the preceding code. Eg. one
+ * of such atomic operations (atomic_dec_and_return, ...), or a
+ * unlock/lock sequence, etc.
+ */
+ if (waitqueue_active(wq))
+ wake_up(wq);
+}
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ return div_u64(num, 10);
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ return div_u64(num, 100);
+}
+
+/* Copy of is_power_of_two that is 64bit safe */
+static inline bool is_power_of_two_u64(u64 n)
+{
+ return n != 0 && (n & (n - 1)) == 0;
+}
+
+static inline bool has_single_bit_set(u64 n)
+{
+ return is_power_of_two_u64(n);
+}
+
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 45e3cfd1198b..fb09bc2f8e4d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -7,12 +7,14 @@
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
+#include "delalloc-space.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -195,8 +197,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
- if (dio)
+ if (dio) {
+ percpu_counter_add_batch(&fs_info->dio_bytes, len,
+ fs_info->delalloc_batch);
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+ }
/* one ref for the tree */
refcount_set(&entry->refs, 1);
@@ -271,13 +276,12 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
* when an ordered extent is finished. If the list covers more than one
* ordered extent, it is split across multiples.
*/
-void btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &BTRFS_I(entry->inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
spin_unlock_irq(&tree->lock);
@@ -469,6 +473,10 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
+ if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
+ fs_info->delalloc_batch);
+
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
@@ -539,7 +547,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -565,12 +572,11 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
return count;
}
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
- u64 total_done = 0;
u64 done;
INIT_LIST_HEAD(&splice);
@@ -590,7 +596,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
btrfs_put_fs_root(root);
- total_done += done;
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@@ -600,8 +605,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
-
- return total_done;
}
/*
@@ -918,14 +921,16 @@ out:
* be reclaimed before their checksum is actually put into the btree
*/
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u32 *sum, int len)
+ u8 *sum, int len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
ordered = btrfs_lookup_ordered_extent(inode, offset);
@@ -941,10 +946,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
num_sectors = ordered_sum->len >>
inode->i_sb->s_blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
- memcpy(sum + index, ordered_sum->sums + i,
- num_sectors);
+ memcpy(sum + index, ordered_sum->sums + i * csum_size,
+ num_sectors * csum_size);
- index += (int)num_sectors;
+ index += (int)num_sectors * csum_size;
if (index == len)
goto out;
disk_bytenr += num_sectors * sectorsize;
@@ -956,6 +961,52 @@ out:
return index;
}
+/*
+ * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
+ * ordered extents in it are run to completion.
+ *
+ * @tree: IO tree used for locking out other users of the range
+ * @inode: Inode whose ordered tree is to be searched
+ * @start: Beginning of range to flush
+ * @end: Last byte of range to lock
+ * @cached_state: If passed, will return the extent state responsible for the
+ * locked range. It's the caller's responsibility to free the cached state.
+ *
+ * This function always returns with the given range locked, ensuring after it's
+ * called no order extent can be pending.
+ */
+void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
+ struct btrfs_inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cache = NULL;
+ struct extent_state **cachedp = &cache;
+
+ if (cached_state)
+ cachedp = cached_state;
+
+ while (1) {
+ lock_extent_bits(tree, start, end, cachedp);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ end - start + 1);
+ if (!ordered) {
+ /*
+ * If no external cached_state has been passed then
+ * decrement the extra ref taken for cachedp since we
+ * aren't exposing it outside of this function
+ */
+ if (!cached_state)
+ refcount_dec(&cache->refs);
+ break;
+ }
+ unlock_extent_cached(tree, start, end, cachedp);
+ btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index fb9a161f0215..4eb0319a86d7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -23,7 +23,7 @@ struct btrfs_ordered_sum {
int len;
struct list_head list;
/* last field is a variable length array of csums */
- u32 sums[];
+ u8 sums[];
};
/*
@@ -167,8 +167,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
-void btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
@@ -184,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u32 *sum, int len);
+ u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
+void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
+ struct btrfs_inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index df49931ffe92..873b6b694107 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb)
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info,
"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
- atomic_read(&eb->refs), atomic_read(&eb->write_locks),
+ atomic_read(&eb->refs), eb->write_locks,
atomic_read(&eb->read_locks),
- atomic_read(&eb->blocking_writers),
+ eb->blocking_writers,
atomic_read(&eb->blocking_readers),
- atomic_read(&eb->spinning_writers),
+ eb->spinning_writers,
atomic_read(&eb->spinning_readers),
eb->lock_owner, current->pid);
#endif
@@ -189,7 +189,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
btrfs_info(fs_info,
"leaf %llu gen %llu total ptrs %d free space %d owner %llu",
btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
- btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
+ btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
@@ -266,9 +266,9 @@ void btrfs_print_leaf(struct extent_buffer *l)
struct btrfs_block_group_item);
pr_info(
"\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
- btrfs_disk_block_group_used(l, bi),
- btrfs_disk_block_group_chunk_objectid(l, bi),
- btrfs_disk_block_group_flags(l, bi));
+ btrfs_block_group_used(l, bi),
+ btrfs_block_group_chunk_objectid(l, bi),
+ btrfs_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 61d22a56c0ba..deb59e7cfcac 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -23,36 +23,6 @@ struct prop_handler {
int inheritable;
};
-static int prop_compression_validate(const char *value, size_t len);
-static int prop_compression_apply(struct inode *inode,
- const char *value,
- size_t len);
-static const char *prop_compression_extract(struct inode *inode);
-
-static struct prop_handler prop_handlers[] = {
- {
- .xattr_name = XATTR_BTRFS_PREFIX "compression",
- .validate = prop_compression_validate,
- .apply = prop_compression_apply,
- .extract = prop_compression_extract,
- .inheritable = 1
- },
-};
-
-void __init btrfs_props_init(void)
-{
- int i;
-
- hash_init(prop_handlers_ht);
-
- for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
- struct prop_handler *p = &prop_handlers[i];
- u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
-
- hash_add(prop_handlers_ht, &p->node, h);
- }
-}
-
static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
{
struct hlist_head *h;
@@ -85,15 +55,9 @@ find_prop_handler(const char *name,
return NULL;
}
-static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
- struct inode *inode,
- const char *name,
- const char *value,
- size_t value_len,
- int flags)
+int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
{
const struct prop_handler *handler;
- int ret;
if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
return -EINVAL;
@@ -102,9 +66,26 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
if (!handler)
return -EINVAL;
+ if (value_len == 0)
+ return 0;
+
+ return handler->validate(value, value_len);
+}
+
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const char *value, size_t value_len,
+ int flags)
+{
+ const struct prop_handler *handler;
+ int ret;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
if (value_len == 0) {
ret = btrfs_setxattr(trans, inode, handler->xattr_name,
- NULL, 0, flags);
+ NULL, 0, flags);
if (ret)
return ret;
@@ -114,17 +95,14 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
return ret;
}
- ret = handler->validate(value, value_len);
- if (ret)
- return ret;
- ret = btrfs_setxattr(trans, inode, handler->xattr_name,
- value, value_len, flags);
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+ value_len, flags);
if (ret)
return ret;
ret = handler->apply(inode, value, value_len);
if (ret) {
- btrfs_setxattr(trans, inode, handler->xattr_name,
- NULL, 0, flags);
+ btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+ 0, flags);
return ret;
}
@@ -133,15 +111,6 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_set_prop(struct inode *inode,
- const char *name,
- const char *value,
- size_t value_len,
- int flags)
-{
- return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
-}
-
static int iterate_object_props(struct btrfs_root *root,
struct btrfs_path *path,
u64 objectid,
@@ -283,6 +252,74 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
return ret;
}
+static int prop_compression_validate(const char *value, size_t len)
+{
+ if (!value)
+ return 0;
+
+ if (btrfs_compress_is_valid_type(value, len))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode, const char *value,
+ size_t len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int type;
+
+ if (len == 0) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+
+ return 0;
+ }
+
+ if (!strncmp("lzo", value, 3)) {
+ type = BTRFS_COMPRESS_LZO;
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ } else if (!strncmp("zlib", value, 4)) {
+ type = BTRFS_COMPRESS_ZLIB;
+ } else if (!strncmp("zstd", value, 4)) {
+ type = BTRFS_COMPRESS_ZSTD;
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ } else {
+ return -EINVAL;
+ }
+
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->prop_compress = type;
+
+ return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+ switch (BTRFS_I(inode)->prop_compress) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+static struct prop_handler prop_handlers[] = {
+ {
+ .xattr_name = XATTR_BTRFS_PREFIX "compression",
+ .validate = prop_compression_validate,
+ .apply = prop_compression_apply,
+ .extract = prop_compression_extract,
+ .inheritable = 1
+ },
+};
+
static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
@@ -291,6 +328,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
+ bool need_reserve = false;
if (!test_bit(BTRFS_INODE_HAS_PROPS,
&BTRFS_I(parent)->runtime_flags))
@@ -299,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
const struct prop_handler *h = &prop_handlers[i];
const char *value;
- u64 num_bytes;
+ u64 num_bytes = 0;
if (!h->inheritable)
continue;
@@ -308,20 +346,51 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!value)
continue;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- num_bytes, BTRFS_RESERVE_NO_FLUSH);
- if (ret)
- goto out;
- ret = __btrfs_set_prop(trans, inode, h->xattr_name,
- value, strlen(value), 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes);
+ /*
+ * This is not strictly necessary as the property should be
+ * valid, but in case it isn't, don't propagate it futher.
+ */
+ ret = h->validate(value, strlen(value));
if (ret)
- goto out;
+ continue;
+
+ /*
+ * Currently callers should be reserving 1 item for properties,
+ * since we only have 1 property that we currently support. If
+ * we add more in the future we need to try and reserve more
+ * space for them. But we should also revisit how we do space
+ * reservations if we do add more properties in the future.
+ */
+ if (need_reserve) {
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ strlen(value), 0);
+ if (!ret) {
+ ret = h->apply(inode, value, strlen(value));
+ if (ret)
+ btrfs_setxattr(trans, inode, h->xattr_name,
+ NULL, 0, 0);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ if (need_reserve) {
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ num_bytes);
+ if (ret)
+ return ret;
+ }
+ need_reserve = true;
}
- ret = 0;
-out:
- return ret;
+
+ return 0;
}
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
@@ -347,11 +416,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
+ parent_inode = btrfs_iget(sb, &key, parent_root);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
- child_inode = btrfs_iget(sb, &key, root, NULL);
+ child_inode = btrfs_iget(sb, &key, root);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
@@ -364,64 +433,15 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
return ret;
}
-static int prop_compression_validate(const char *value, size_t len)
-{
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
- return 0;
-
- return -EINVAL;
-}
-
-static int prop_compression_apply(struct inode *inode,
- const char *value,
- size_t len)
+void __init btrfs_props_init(void)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int type;
-
- if (len == 0) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
-
- return 0;
- }
-
- if (!strncmp("lzo", value, 3)) {
- type = BTRFS_COMPRESS_LZO;
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (!strncmp("zlib", value, 4)) {
- type = BTRFS_COMPRESS_ZLIB;
- } else if (!strncmp("zstd", value, 4)) {
- type = BTRFS_COMPRESS_ZSTD;
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- } else {
- return -EINVAL;
- }
-
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = type;
+ int i;
- return 0;
-}
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
+ u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
-static const char *prop_compression_extract(struct inode *inode)
-{
- switch (BTRFS_I(inode)->prop_compress) {
- case BTRFS_COMPRESS_ZLIB:
- case BTRFS_COMPRESS_LZO:
- case BTRFS_COMPRESS_ZSTD:
- return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
- default:
- break;
+ hash_add(prop_handlers_ht, &p->node, h);
}
-
- return NULL;
}
-
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 618815b4f9d5..40b2c65b518c 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -10,11 +10,10 @@
void __init btrfs_props_init(void);
-int btrfs_set_prop(struct inode *inode,
- const char *name,
- const char *value,
- size_t value_len,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const char *value, size_t value_len,
int flags);
+int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e659d9d61107..39fc8c3d3a75 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -21,7 +21,7 @@
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
-
+#include "block-group.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -918,8 +918,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
/*
* initially create the quota tree
*/
- quota_root = btrfs_create_tree(trans, fs_info,
- BTRFS_QUOTA_TREE_OBJECTID);
+ quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
if (IS_ERR(quota_root)) {
ret = PTR_ERR(quota_root);
btrfs_abort_transaction(trans, ret);
@@ -1101,7 +1100,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
list_del(&quota_root->dirty_list);
btrfs_tree_lock(quota_root->node);
- clean_tree_block(fs_info, quota_root->node);
+ btrfs_clean_tree_block(quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
@@ -1313,8 +1312,9 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
+ bool found = false;
int ret = 0;
- int err;
+ int ret2;
tmp = ulist_alloc(GFP_KERNEL);
if (!tmp)
@@ -1328,28 +1328,39 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst);
- if (!member || !parent) {
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * The parent/member pair doesn't exist, then try to delete the dead
+ * relation items only.
+ */
+ if (!member || !parent)
+ goto delete_item;
/* check if such qgroup relation exist firstly */
list_for_each_entry(list, &member->groups, next_group) {
- if (list->group == parent)
- goto exist;
+ if (list->group == parent) {
+ found = true;
+ break;
+ }
}
- ret = -ENOENT;
- goto out;
-exist:
+
+delete_item:
ret = del_qgroup_relation_item(trans, src, dst);
- err = del_qgroup_relation_item(trans, dst, src);
- if (err && !ret)
- ret = err;
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ ret2 = del_qgroup_relation_item(trans, dst, src);
+ if (ret2 < 0 && ret2 != -ENOENT)
+ goto out;
- spin_lock(&fs_info->qgroup_lock);
- del_relation_rb(fs_info, src, dst);
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
- spin_unlock(&fs_info->qgroup_lock);
+ /* At least one deletion succeeded, return 0 */
+ if (!ret || !ret2)
+ ret = 0;
+
+ if (found) {
+ spin_lock(&fs_info->qgroup_lock);
+ del_relation_rb(fs_info, src, dst);
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ spin_unlock(&fs_info->qgroup_lock);
+ }
out:
ulist_free(tmp);
return ret;
@@ -1800,7 +1811,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- extent_buffer_get(src_eb);
+ atomic_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2056,7 +2067,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- extent_buffer_get(dst_eb);
+ atomic_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2115,7 +2126,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- extent_buffer_get(root_eb); /* For path */
+ atomic_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2412,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 nr_old_roots = 0;
int ret = 0;
+ /*
+ * If quotas get disabled meanwhile, the resouces need to be freed and
+ * we can't just exit here.
+ */
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return 0;
+ goto out_free;
if (new_roots) {
if (!maybe_fs_roots(new_roots))
@@ -2615,6 +2630,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
int ret = 0;
int i;
u64 *i_qgroups;
+ bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
@@ -2622,7 +2638,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u32 level_size = 0;
u64 nums;
- mutex_lock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * There are only two callers of this function.
+ *
+ * One in create_subvol() in the ioctl context, which needs to hold
+ * the qgroup_ioctl_lock.
+ *
+ * The other one in create_pending_snapshot() where no other qgroup
+ * code can modify the fs as they all need to either start a new trans
+ * or hold a trans handler, thus we don't need to hold
+ * qgroup_ioctl_lock.
+ * This would avoid long and complex lock chain and make lockdep happy.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
+ committing = true;
+ spin_unlock(&fs_info->trans_lock);
+
+ if (!committing)
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out;
@@ -2786,7 +2820,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
unlock:
spin_unlock(&fs_info->qgroup_lock);
out:
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (!committing)
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
@@ -3135,9 +3170,6 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -3153,16 +3185,30 @@ out:
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
err);
- goto done;
}
- ret = update_qgroup_status_item(trans);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
}
+ fs_info->qgroup_rescan_running = false;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
btrfs_end_transaction(trans);
if (btrfs_fs_closing(fs_info)) {
@@ -3173,12 +3219,6 @@ out:
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
-
-done:
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = false;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- complete_all(&fs_info->qgroup_rescan_completion);
}
/*
@@ -3196,12 +3236,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup is not enabled");
+ "qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup rescan is not queued");
+ "qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
}
@@ -3241,10 +3281,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- memset(&fs_info->qgroup_rescan_work, 0,
- sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
return 0;
}
@@ -3406,6 +3443,9 @@ cleanup:
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
+ /* Also free data bytes of already reserved one */
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+ orig_reserved, BTRFS_QGROUP_RSV_DATA);
extent_changeset_release(reserved);
return ret;
}
@@ -3450,7 +3490,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
@@ -3590,7 +3630,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+ trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -3637,7 +3677,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
num_bytes, type);
}
@@ -3787,7 +3827,7 @@ out:
*/
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot)
@@ -3831,7 +3871,13 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
subvol_slot);
block->last_snapshot = last_snapshot;
block->level = level;
- if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+
+ /*
+ * If we have bg == NULL, we're called from btrfs_recover_relocation(),
+ * no one else can modify tree blocks thus we qgroup will not change
+ * no matter the value of trace_leaf.
+ */
+ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
block->trace_leaf = true;
else
block->trace_leaf = false;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 46ba7bd2961c..236f12224d52 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -408,7 +408,7 @@ void btrfs_qgroup_init_swapped_blocks(
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 67a6f7d47402..a8e53c8e7b01 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -35,6 +35,22 @@
#define RBIO_CACHE_SIZE 1024
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ spinlock_t lock;
+};
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -174,7 +190,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
+ btrfs_init_work(&rbio->work, work_func, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
@@ -655,8 +671,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
*/
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
- int bucket = rbio_bucket(rbio);
- struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
@@ -664,64 +679,63 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
+ h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
- spin_lock(&cur->bio_list_lock);
-
- /* can we steal this cached rbio's pages? */
- if (bio_list_empty(&cur->bio_list) &&
- list_empty(&cur->plug_list) &&
- test_bit(RBIO_CACHE_BIT, &cur->flags) &&
- !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
- list_del_init(&cur->hash_list);
- refcount_dec(&cur->refs);
-
- steal_rbio(cur, rbio);
- cache_drop = cur;
- spin_unlock(&cur->bio_list_lock);
+ if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ continue;
- goto lockit;
- }
+ spin_lock(&cur->bio_list_lock);
- /* can we merge into the lock owner? */
- if (rbio_can_merge(cur, rbio)) {
- merge_rbio(cur, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
+ /* Can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ refcount_dec(&cur->refs);
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
- /*
- * we couldn't merge with the running
- * rbio, see if we can merge with the
- * pending ones. We don't have to
- * check for rmw_locked because there
- * is no way they are inside finish_rmw
- * right now
- */
- list_for_each_entry(pending, &cur->plug_list,
- plug_list) {
- if (rbio_can_merge(pending, rbio)) {
- merge_rbio(pending, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
- }
+ goto lockit;
+ }
- /* no merging, put us on the tail of the plug list,
- * our rbio will be started with the currently
- * running rbio unlocks
- */
- list_add_tail(&rbio->plug_list, &cur->plug_list);
+ /* Can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
ret = 1;
goto out;
}
+
+
+ /*
+ * We couldn't merge with the running rbio, see if we can merge
+ * with the pending ones. We don't have to check for rmw_locked
+ * because there is no way they are inside finish_rmw right now
+ */
+ list_for_each_entry(pending, &cur->plug_list, plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /*
+ * No merging, put us on the tail of the plug list, our rbio
+ * will be started with the currently running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
}
lockit:
refcount_inc(&rbio->refs);
@@ -1442,12 +1456,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
- int i;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i, iter_all)
+ bio_for_each_segment_all(bvec, bio, iter_all)
SetPageUptodate(bvec->bv_page);
}
@@ -1728,8 +1741,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, btrfs_rmw_helper,
- unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index f5d4c13a8dbc..2503485db859 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,7 +7,7 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
-static inline int nr_parity_stripes(struct map_lookup *map)
+static inline int nr_parity_stripes(const struct map_lookup *map)
{
if (map->type & BTRFS_BLOCK_GROUP_RAID5)
return 1;
@@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map)
return 0;
}
-static inline int nr_data_stripes(struct map_lookup *map)
+static inline int nr_data_stripes(const struct map_lookup *map)
{
return map->num_stripes - nr_parity_stripes(map);
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 10d9589001a9..243a2e44526e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -14,6 +14,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "dev-replace.h"
+#include "block-group.h"
#undef DEBUG
@@ -226,7 +227,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
u64 start;
u64 end;
int i;
@@ -247,8 +248,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
if (!cache)
return NULL;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
btrfs_put_block_group(cache);
zone = kzalloc(sizeof(*zone), GFP_KERNEL);
@@ -638,6 +639,35 @@ static int reada_pick_zone(struct btrfs_device *dev)
return 1;
}
+static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ if (IS_ERR(buf))
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
+ if (ret) {
+ free_extent_buffer_stale(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer_stale(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(buf)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
static int reada_start_machine_dev(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -722,21 +752,19 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
- struct btrfs_fs_info *fs_info;
int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
- fs_info = rmw->fs_info;
-
- kfree(rmw);
old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
task_nice_ioprio(current));
set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(fs_info);
+ __reada_start_machine(rmw->fs_info);
set_task_ioprio(current, old_ioprio);
- atomic_dec(&fs_info->reada_works_cnt);
+ atomic_dec(&rmw->fs_info->reada_works_cnt);
+
+ kfree(rmw);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -747,6 +775,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
u64 total = 0;
int i;
+again:
do {
enqueued = 0;
mutex_lock(&fs_devices->device_list_mutex);
@@ -758,6 +787,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
if (enqueued == 0)
return;
@@ -786,8 +819,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- btrfs_init_work(&rmw->work, btrfs_readahead_helper,
- reada_start_machine_worker, NULL, NULL);
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d09b6cdb785a..b57f3618e58e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
#ifdef CONFIG_STACKTRACE
static void __save_stack_trace(struct ref_action *ra)
{
- struct stack_trace stack_trace;
-
- stack_trace.max_entries = MAX_TRACE;
- stack_trace.nr_entries = 0;
- stack_trace.entries = ra->trace;
- stack_trace.skip = 2;
- save_stack_trace(&stack_trace);
- ra->trace_len = stack_trace.nr_entries;
+ ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
}
static void __print_stack_trace(struct btrfs_fs_info *fs_info,
struct ref_action *ra)
{
- struct stack_trace trace;
-
if (ra->trace_len == 0) {
btrfs_err(fs_info, " ref-verify: no stacktrace");
return;
}
- trace.nr_entries = ra->trace_len;
- trace.entries = ra->trace;
- print_stack_trace(&trace, 2);
+ stack_trace_print(ra->trace, ra->trace_len, 2);
}
#else
static void inline __save_stack_trace(struct ref_action *ra)
@@ -511,7 +500,7 @@ static int process_leaf(struct btrfs_root *root,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret;
+ int i = 0, tree_block_level = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
@@ -520,6 +509,7 @@ static int process_leaf(struct btrfs_root *root,
switch (key.type) {
case BTRFS_EXTENT_ITEM_KEY:
*num_bytes = key.offset;
+ /* fall through */
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
@@ -670,36 +660,43 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
/*
* btrfs_ref_tree_mod: called when we modify a ref for a bytenr
- * @root: the root we are making this modification from.
- * @bytenr: the bytenr we are modifying.
- * @num_bytes: number of bytes.
- * @parent: the parent bytenr.
- * @ref_root: the original root owner of the bytenr.
- * @owner: level in the case of metadata, inode in the case of data.
- * @offset: 0 for metadata, file offset for data.
- * @action: the action that we are doing, this is the same as the delayed ref
- * action.
*
* This will add an action item to the given bytenr and do sanity checks to make
* sure we haven't messed something up. If we are making a new allocation and
* this block entry has history we will delete all previous actions as long as
* our sanity checks pass as they are no longer needed.
*/
-int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
- u64 parent, u64 ref_root, u64 owner, u64 offset,
- int action)
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct ref_entry *ref = NULL, *exist;
struct ref_action *ra = NULL;
struct block_entry *be = NULL;
struct root_entry *re = NULL;
+ int action = generic_ref->action;
int ret = 0;
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+ bool metadata;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
+ u64 ref_root;
+ u64 owner;
+ u64 offset;
- if (!btrfs_test_opt(root->fs_info, REF_VERIFY))
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ if (generic_ref->type == BTRFS_REF_METADATA) {
+ ref_root = generic_ref->tree_ref.root;
+ owner = generic_ref->tree_ref.level;
+ offset = 0;
+ } else {
+ ref_root = generic_ref->data_ref.ref_root;
+ owner = generic_ref->data_ref.ino;
+ offset = generic_ref->data_ref.offset;
+ }
+ metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
if (!ra || !ref) {
@@ -732,7 +729,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
INIT_LIST_HEAD(&ra->list);
ra->action = action;
- ra->root = root->root_key.objectid;
+ ra->root = generic_ref->real_root;
/*
* This is an allocation, preallocate the block_entry in case we haven't
@@ -745,7 +742,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* is and the new root objectid, so let's not treat the passed
* in root as if it really has a ref for this bytenr.
*/
- be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root);
+ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
if (IS_ERR(be)) {
kfree(ra);
ret = PTR_ERR(be);
@@ -787,13 +784,13 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* one we want to lookup below when we modify the
* re->num_refs.
*/
- ref_root = root->root_key.objectid;
- re->root_objectid = root->root_key.objectid;
+ ref_root = generic_ref->real_root;
+ re->root_objectid = generic_ref->real_root;
re->num_refs = 0;
}
- spin_lock(&root->fs_info->ref_verify_lock);
- be = lookup_block_entry(&root->fs_info->block_tree, bytenr);
+ spin_lock(&fs_info->ref_verify_lock);
+ be = lookup_block_entry(&fs_info->block_tree, bytenr);
if (!be) {
btrfs_err(fs_info,
"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
@@ -862,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* didn't think of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
- root->root_key.objectid, be->bytenr);
+ generic_ref->real_root, be->bytenr);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ra);
@@ -881,7 +878,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
list_add_tail(&ra->list, &be->actions);
ret = 0;
out_unlock:
- spin_unlock(&root->fs_info->ref_verify_lock);
+ spin_unlock(&fs_info->ref_verify_lock);
out:
if (ret)
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index b7d2a4edfdb7..855de37719b5 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -9,9 +9,8 @@
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
-int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
- u64 parent, u64 ref_root, u64 owner, u64 offset,
- int action);
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref);
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
@@ -30,9 +29,8 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
{
}
-static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action)
+static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref)
{
return 0;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ddf028509931..da5abd62db22 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -20,6 +20,8 @@
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
+#include "delalloc-space.h"
+#include "block-group.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -145,7 +147,7 @@ struct file_extent_cluster {
struct reloc_control {
/* block group to relocate */
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/* extent tree */
struct btrfs_root *extent_root;
/* inode for moving data */
@@ -515,6 +517,34 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
+static bool reloc_root_is_dead(struct btrfs_root *root)
+{
+ /*
+ * Pair with set_bit/clear_bit in clean_dirty_subvols and
+ * btrfs_update_reloc_root. We need to see the updated bit before
+ * trying to access reloc_root
+ */
+ smp_rmb();
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ return true;
+ return false;
+}
+
+/*
+ * Check if this subvolume tree has valid reloc tree.
+ *
+ * Reloc tree after swap is considered dead, thus not considered as valid.
+ * This is enough for most callers, as they don't distinguish dead reloc root
+ * from no reloc root. But should_ignore_root() below is a special case.
+ */
+static bool have_reloc_root(struct btrfs_root *root)
+{
+ if (reloc_root_is_dead(root))
+ return false;
+ if (!root->reloc_root)
+ return false;
+ return true;
+}
static int should_ignore_root(struct btrfs_root *root)
{
@@ -523,6 +553,10 @@ static int should_ignore_root(struct btrfs_root *root)
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
+ /* This root has been merged with its reloc tree, we can ignore it */
+ if (reloc_root_is_dead(root))
+ return 1;
+
reloc_root = root->reloc_root;
if (!reloc_root)
return 0;
@@ -1433,6 +1467,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ /*
+ * The subvolume has reloc tree but the swap is finished, no need to
+ * create/update the dead reloc tree
+ */
+ if (reloc_root_is_dead(root))
+ return 0;
+
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
@@ -1469,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
- !root->reloc_root)
+ if (!have_reloc_root(root))
goto out;
reloc_root = root->reloc_root;
@@ -1480,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ /*
+ * Mark the tree as dead before we change reloc_root so
+ * have_reloc_root will not touch it from now on.
+ */
+ smp_wmb();
__del_reloc_root(reloc_root);
}
@@ -1551,11 +1596,10 @@ again:
return NULL;
}
-static int in_block_group(u64 bytenr,
- struct btrfs_block_group_cache *block_group)
+static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group)
{
- if (bytenr >= block_group->key.objectid &&
- bytenr < block_group->key.objectid + block_group->key.offset)
+ if (bytenr >= block_group->start &&
+ bytenr < block_group->start + block_group->length)
return 1;
return 0;
}
@@ -1643,6 +1687,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
nritems = btrfs_header_nritems(leaf);
for (i = 0; i < nritems; i++) {
+ struct btrfs_ref ref = { 0 };
+
cond_resched();
btrfs_item_key_to_cpu(leaf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
@@ -1703,18 +1749,23 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
- num_bytes, parent,
- btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+ num_bytes, parent);
+ ref.real_root = root->root_key.objectid;
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- parent, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ num_bytes, parent);
+ ref.real_root = root->root_key.objectid;
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ key.objectid, key.offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
@@ -1756,6 +1807,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_fs_info *fs_info = dest->fs_info;
struct extent_buffer *eb;
struct extent_buffer *parent;
+ struct btrfs_ref ref = { 0 };
struct btrfs_key key;
u64 old_bytenr;
u64 new_bytenr;
@@ -1916,23 +1968,31 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(path->nodes[level]);
- ret = btrfs_inc_extent_ref(trans, src, old_bytenr,
- blocksize, path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
+ blocksize, path->nodes[level]->start);
+ ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
- ret = btrfs_inc_extent_ref(trans, dest, new_bytenr,
- blocksize, 0, dest->root_key.objectid,
- level - 1, 0);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+ blocksize, 0);
+ ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
- path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
+ blocksize, path->nodes[level]->start);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ ref.skip_qgroup = true;
+ ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
- 0, dest->root_key.objectid, level - 1,
- 0);
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
+ blocksize, 0);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ ref.skip_qgroup = true;
+ ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -2161,22 +2221,35 @@ static int clean_dirty_subvols(struct reloc_control *rc)
struct btrfs_root *root;
struct btrfs_root *next;
int ret = 0;
+ int ret2;
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
- struct btrfs_root *reloc_root = root->reloc_root;
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ /* Merged subvolume, cleanup its reloc root */
+ struct btrfs_root *reloc_root = root->reloc_root;
- clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
- list_del_init(&root->reloc_dirty_list);
- root->reloc_root = NULL;
- if (reloc_root) {
- int ret2;
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
- ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ /*
+ * Need barrier to ensure clear_bit() only happens after
+ * root->reloc_root = NULL. Pairs with have_reloc_root.
+ */
+ smp_wmb();
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ btrfs_put_fs_root(root);
+ } else {
+ /* Orphan reloc tree, just clean it up */
+ ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
if (ret2 < 0 && !ret)
ret = ret2;
}
- btrfs_put_fs_root(root);
}
return ret;
}
@@ -2213,7 +2286,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- extent_buffer_get(reloc_root->node);
+ atomic_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -2464,6 +2537,9 @@ again:
}
} else {
list_del_init(&reloc_root->root_list);
+ /* Don't forget to queue this reloc root for cleanup */
+ list_add_tail(&reloc_root->reloc_dirty_list,
+ &rc->dirty_subvol_roots);
}
}
@@ -2721,6 +2797,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
struct btrfs_key first_key;
+ struct btrfs_ref ref = { 0 };
cond_resched();
@@ -2826,11 +2903,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(upper->eb);
- ret = btrfs_inc_extent_ref(trans, root,
- node->eb->start, blocksize,
- upper->eb->start,
- btrfs_header_owner(upper->eb),
- node->level, 0);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ node->eb->start, blocksize,
+ upper->eb->start);
+ ref.real_root = root->root_key.objectid;
+ btrfs_init_tree_ref(&ref, node->level,
+ btrfs_header_owner(upper->eb));
+ ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -3156,7 +3235,6 @@ static noinline_for_stack
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -3169,7 +3247,6 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -3238,6 +3315,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!page) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3258,7 +3337,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
@@ -3280,14 +3359,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- NULL, 0);
+ NULL);
if (ret) {
unlock_page(page);
put_page(page);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
clear_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
@@ -3303,8 +3382,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
@@ -3504,7 +3582,7 @@ static int block_use_full_backref(struct reloc_control *rc,
}
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
@@ -3520,7 +3598,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3823,7 +3901,7 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
u64 start, end, last;
int ret;
- last = rc->block_group->key.objectid + rc->block_group->key.offset;
+ last = rc->block_group->start + rc->block_group->length;
while (1) {
cond_resched();
if (rc->search_start >= last) {
@@ -3940,7 +4018,7 @@ int prepare_to_relocate(struct reloc_control *rc)
return -ENOMEM;
memset(&rc->cluster, 0, sizeof(rc->cluster));
- rc->search_start = rc->block_group->key.objectid;
+ rc->search_start = rc->block_group->start;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
@@ -4179,7 +4257,7 @@ out:
*/
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+ struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -4206,9 +4284,9 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
BUG_ON(IS_ERR(inode));
- BTRFS_I(inode)->index_cnt = group->key.objectid;
+ BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
@@ -4222,7 +4300,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -4234,7 +4312,8 @@ static struct reloc_control *alloc_reloc_control(void)
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL);
+ extent_io_tree_init(fs_info, &rc->processed_blocks,
+ IO_TREE_RELOC_BLOCKS, NULL);
return rc;
}
@@ -4242,7 +4321,7 @@ static struct reloc_control *alloc_reloc_control(void)
* Print the block group being relocated
*/
static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
@@ -4250,7 +4329,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info,
"relocating block group %llu flags %s",
- block_group->key.objectid, buf);
+ block_group->start, buf);
}
/*
@@ -4258,7 +4337,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
@@ -4276,7 +4355,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
return -ETXTBSY;
}
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(fs_info);
if (!rc) {
btrfs_put_block_group(bg);
return -ENOMEM;
@@ -4285,7 +4364,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->extent_root = extent_root;
rc->block_group = bg;
- ret = btrfs_inc_block_group_ro(rc->block_group);
+ ret = btrfs_inc_block_group_ro(rc->block_group, true);
if (ret) {
err = ret;
goto out;
@@ -4298,7 +4377,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- inode = lookup_free_space_inode(fs_info, rc->block_group, path);
+ inode = lookup_free_space_inode(rc->block_group, path);
btrfs_free_path(path);
if (!IS_ERR(inode))
@@ -4323,39 +4402,48 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->key.objectid,
- rc->block_group->key.offset);
+ rc->block_group->start,
+ rc->block_group->length);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0) {
+ if (ret < 0)
err = ret;
- goto out;
- }
-
- if (rc->extents_found == 0)
- break;
-
- btrfs_info(fs_info, "found %llu extents", rc->extents_found);
+ /*
+ * We may have gotten ENOSPC after we already dirtied some
+ * extents. If writeout happens while we're relocating a
+ * different block group we could end up hitting the
+ * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
+ * btrfs_reloc_cow_block. Make sure we write everything out
+ * properly so we don't trip over this problem, and then break
+ * out of the loop if we hit an error.
+ */
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
ret = btrfs_wait_ordered_range(rc->data_inode, 0,
(u64)-1);
- if (ret) {
+ if (ret)
err = ret;
- goto out;
- }
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
+
+ if (err < 0)
+ goto out;
+
+ if (rc->extents_found == 0)
+ break;
+
+ btrfs_info(fs_info, "found %llu extents", rc->extents_found);
+
}
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+ WARN_ON(rc->block_group->used > 0);
out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
@@ -4472,7 +4560,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control();
+ rc = alloc_reloc_control(fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
@@ -4505,6 +4593,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
goto out_free;
}
@@ -4594,7 +4683,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
sums->bytenr = new_bytenr;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_add_ordered_sum(ordered, sums);
}
out:
btrfs_put_ordered_extent(ordered);
@@ -4638,7 +4727,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
node->new_bytenr != buf->start);
drop_node_buffer(node);
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
@@ -4667,14 +4756,12 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
{
- struct btrfs_root *root;
- struct reloc_control *rc;
+ struct btrfs_root *root = pending->root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
- root = pending->root;
- if (!root->reloc_root)
+ if (!rc || !have_reloc_root(root))
return;
- rc = root->fs_info->reloc_ctl;
if (!rc->merge_reloc_tree)
return;
@@ -4703,10 +4790,10 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *reloc_root;
struct btrfs_root *new_root;
- struct reloc_control *rc;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
int ret;
- if (!root->reloc_root)
+ if (!rc || !have_reloc_root(root))
return 0;
rc = root->fs_info->reloc_ctl;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 893d12fbfda0..612411c74550 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -9,6 +9,8 @@
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
+#include "qgroup.h"
+#include "space-info.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
@@ -132,16 +134,17 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
+ if (ret < 0)
goto out;
- }
- if (ret != 0) {
- btrfs_print_leaf(path->nodes[0]);
- btrfs_crit(fs_info, "unable to update root key %llu %u %llu",
- key->objectid, key->type, key->offset);
- BUG_ON(1);
+ if (ret > 0) {
+ btrfs_crit(fs_info,
+ "unable to find root key (%llu %u %llu) in tree %llu",
+ key->objectid, key->type, key->offset,
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
l = path->nodes[0];
@@ -373,11 +376,13 @@ again:
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
-
- WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
- WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
- WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
+ (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
+ memcmp_extent_buffer(leaf, name, ptr, name_len)) {
+ err = -ENOENT;
+ goto out;
+ }
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
@@ -496,3 +501,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
spin_unlock(&root->root_item_lock);
}
+
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * use_global_rsv: allow fallback to the global block reservation
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv, int items,
+ bool use_global_rsv)
+{
+ u64 qgroup_num_bytes = 0;
+ u64 num_bytes;
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* One for parent inode, two for dir entries */
+ qgroup_num_bytes = 3 * fs_info->nodesize;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ qgroup_num_bytes, true);
+ if (ret)
+ return ret;
+ }
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
+ rsv->space_info = btrfs_find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+
+ if (ret == -ENOSPC && use_global_rsv)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
+
+ if (ret && qgroup_num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv)
+{
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a99588536c79..fd266a2d15ec 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
@@ -17,6 +18,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
+#include "block-group.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -387,8 +389,7 @@ static struct full_stripe_lock *search_full_stripe_lock(
*
* Caller must ensure @cache is a RAID56 block group.
*/
-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
- u64 bytenr)
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
u64 ret;
@@ -402,8 +403,8 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
*/
- ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
- cache->full_stripe_len + cache->key.objectid;
+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->start;
return ret;
}
@@ -421,7 +422,7 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
@@ -468,7 +469,7 @@ out:
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
@@ -596,8 +597,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- btrfs_init_work(&sbio->work, btrfs_scrub_helper,
- scrub_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
+ NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -1718,8 +1719,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
- scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
@@ -1787,11 +1787,12 @@ static int scrub_checksum(struct scrub_block *sblock)
static int scrub_checksum_data(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
u8 *on_disk_csum;
struct page *page;
void *buffer;
- u32 crc = ~(u32)0;
u64 len;
int index;
@@ -1799,6 +1800,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
if (!sblock->pagev[0]->have_csum)
return 0;
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
on_disk_csum = sblock->pagev[0]->csum;
page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
@@ -1808,7 +1812,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
- crc = btrfs_csum_data(buffer, crc, l);
+ crypto_shash_update(shash, buffer, l);
kunmap_atomic(buffer);
len -= l;
if (len == 0)
@@ -1820,7 +1824,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
buffer = kmap_atomic(page);
}
- btrfs_csum_final(crc, csum);
+ crypto_shash_final(shash, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
sblock->checksum_error = 1;
@@ -1832,16 +1836,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
- u32 crc = ~(u32)0;
u64 len;
int index;
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
@@ -1875,7 +1882,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
- crc = btrfs_csum_data(p, crc, l);
+ crypto_shash_update(shash, p, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@@ -1889,7 +1896,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
p = mapped_buffer;
}
- btrfs_csum_final(crc, calculated_csum);
+ crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
sblock->checksum_error = 1;
@@ -1900,18 +1907,22 @@ static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
- u32 crc = ~(u32)0;
int fail_gen = 0;
int fail_cor = 0;
u64 len;
int index;
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
@@ -1934,7 +1945,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
- crc = btrfs_csum_data(p, crc, l);
+ crypto_shash_update(shash, p, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@@ -1948,7 +1959,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
p = mapped_buffer;
}
- btrfs_csum_final(crc, calculated_csum);
+ crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++fail_cor;
@@ -2136,14 +2147,13 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
scrub_write_block_to_dev_replace(sblock);
}
- scrub_block_put(sblock);
-
if (sctx->is_dev_replace && sctx->flush_all_writes) {
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
}
+ scrub_block_put(sblock);
scrub_pending_bio_dec(sctx);
}
@@ -2191,8 +2201,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
raid56_add_scrub_pages(rbio, spage->page, spage->logical);
}
- btrfs_init_work(&sblock->work, btrfs_scrub_helper,
- scrub_missing_raid56_worker, NULL, NULL);
+ btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
@@ -2448,7 +2457,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
ASSERT(index < UINT_MAX);
num_sectors = sum->len / sctx->fs_info->sectorsize;
- memcpy(csum, sum->sums + index, sctx->csum_size);
+ memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
@@ -2660,18 +2669,18 @@ static int get_raid56_logic_offset(u64 physical, int num,
u64 last_offset;
u32 stripe_index;
u32 rot;
+ const int data_stripes = nr_data_stripes(map);
- last_offset = (physical - map->stripes[num].physical) *
- nr_data_stripes(map);
+ last_offset = (physical - map->stripes[num].physical) * data_stripes;
if (stripe_start)
*stripe_start = last_offset;
*offset = last_offset;
- for (i = 0; i < nr_data_stripes(map); i++) {
+ for (i = 0; i < data_stripes; i++) {
*offset = last_offset + i * map->stripe_len;
stripe_nr = div64_u64(*offset, map->stripe_len);
- stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
+ stripe_nr = div_u64(stripe_nr, data_stripes);
/* Work out the disk rotation on this stripe-set */
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
@@ -2730,8 +2739,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
bio_put(bio);
- btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
- scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
+ NULL);
btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
@@ -3079,7 +3088,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
@@ -3407,18 +3416,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
int i;
int ret = 0;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- read_unlock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+ read_unlock(&map_tree->lock);
if (!em) {
/*
@@ -3471,7 +3480,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
path = btrfs_alloc_path();
@@ -3550,55 +3559,45 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* -> btrfs_scrub_pause()
*/
scrub_pause_on(fs_info);
- ret = btrfs_inc_block_group_ro(cache);
- if (!ret && sctx->is_dev_replace) {
- /*
- * If we are doing a device replace wait for any tasks
- * that started delalloc right before we set the block
- * group to RO mode, as they might have just allocated
- * an extent from it or decided they could do a nocow
- * write. And if any such tasks did that, wait for their
- * ordered extents to complete and then commit the
- * current transaction, so that we can later see the new
- * extent items in the extent tree - the ordered extents
- * create delayed data references (for cow writes) when
- * they complete, which will be run and insert the
- * corresponding extent items into the extent tree when
- * we commit the transaction they used when running
- * inode.c:btrfs_finish_ordered_io(). We later use
- * the commit root of the extent tree to find extents
- * to copy from the srcdev into the tgtdev, and we don't
- * want to miss any new extents.
- */
- btrfs_wait_block_group_reservations(cache);
- btrfs_wait_nocow_writers(cache);
- ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
- cache->key.objectid,
- cache->key.offset);
- if (ret > 0) {
- struct btrfs_trans_handle *trans;
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- ret = PTR_ERR(trans);
- else
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- scrub_pause_off(fs_info);
- btrfs_put_block_group(cache);
- break;
- }
- }
- }
- scrub_pause_off(fs_info);
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+ * 2. New SYSTEM bg will be allocated
+ * Due to regular version will allocate new chunk.
+ * 3. New SYSTEM bg is empty and will get cleaned up
+ * Before cleanup really happens, it's marked RO again.
+ * 4. Empty SYSTEM bg get scrubbed
+ * We go back to 2.
+ *
+ * This can easily boost the amount of SYSTEM chunks if cleaner
+ * thread can't be triggered fast enough, and use up all space
+ * of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
+ */
+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
if (ret == 0) {
ro_set = 1;
- } else if (ret == -ENOSPC) {
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
/*
* btrfs_inc_block_group_ro return -ENOSPC when it
* failed in creating new chunk for metadata.
- * It is not a problem for scrub/replace, because
+ * It is not a problem for scrub, because
* metadata are always cowed, and our scrub paused
* commit_transactions.
*/
@@ -3607,10 +3606,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
btrfs_put_block_group(cache);
+ scrub_pause_off(fs_info);
break;
}
- down_write(&fs_info->dev_replace.rwsem);
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
+ cache->length);
+ }
+
+ scrub_pause_off(fs_info);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -3651,10 +3663,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- down_write(&fs_info->dev_replace.rwsem);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- up_write(&fs_info->dev_replace.rwsem);
+ up_write(&dev_replace->rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -3668,7 +3680,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
spin_lock(&cache->lock);
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- btrfs_block_group_used(&cache->item) == 0) {
+ cache->used == 0) {
spin_unlock(&cache->lock);
btrfs_mark_bg_unused(cache);
} else {
@@ -3791,7 +3803,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
- return -EINVAL;
+ return -EAGAIN;
if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
/*
@@ -3999,9 +4011,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev)
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct scrub_ctx *sctx;
mutex_lock(&fs_info->scrub_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7ea2d6b1f170..091e5bc8c7ea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -25,6 +25,14 @@
#include "compression.h"
/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
* A fs_path is a helper to dynamically build path names with unknown size.
* It reallocates the internal buffer on demand.
* It allows fast adding of path elements on the right side (normal path) and
@@ -260,6 +268,21 @@ struct name_cache_entry {
char name[];
};
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+ BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx);
+
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
enum btrfs_compare_tree_result result,
@@ -686,7 +709,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
- crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -1160,7 +1183,6 @@ out:
struct backref_ctx {
struct send_ctx *sctx;
- struct btrfs_path *path;
/* number of total found references */
u64 found;
@@ -1213,8 +1235,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
{
struct backref_ctx *bctx = ctx_;
struct clone_root *found;
- int ret;
- u64 i_size;
/* First check if the root is in the list of accepted clone sources */
found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
@@ -1231,30 +1251,25 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
/*
- * There are inodes that have extents that lie behind its i_size. Don't
- * accept clones from these extents.
- */
- ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
- NULL, NULL, NULL);
- btrfs_release_path(bctx->path);
- if (ret < 0)
- return ret;
-
- if (offset + bctx->data_offset + bctx->extent_len > i_size)
- return 0;
-
- /*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
if (found->root == bctx->sctx->send_root) {
/*
- * TODO for the moment we don't accept clones from the inode
- * that is currently send. We may change this when
- * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
- * file.
+ * If the source inode was not yet processed we can't issue a
+ * clone operation, as the source extent does not exist yet at
+ * the destination of the stream.
+ */
+ if (ino > bctx->cur_objectid)
+ return 0;
+ /*
+ * We clone from the inode currently being sent as long as the
+ * source extent is already processed, otherwise we could try
+ * to clone from an extent that does not exist yet at the
+ * destination of the stream.
*/
- if (ino >= bctx->cur_objectid)
+ if (ino == bctx->cur_objectid &&
+ offset >= bctx->sctx->cur_inode_next_write_offset)
return 0;
}
@@ -1303,6 +1318,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1319,8 +1335,6 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
- backref_ctx->path = tmp_path;
-
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1352,7 +1366,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1361,6 +1374,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -4782,7 +4810,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5017,6 +5045,12 @@ static int send_hole(struct send_ctx *sctx, u64 end)
if (offset >= sctx->cur_inode_size)
return 0;
+ /*
+ * Don't go beyond the inode's i_size due to prealloc extents that start
+ * after the i_size.
+ */
+ end = min_t(u64, end, sctx->cur_inode_size);
+
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
@@ -5082,6 +5116,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
@@ -5107,6 +5142,16 @@ static int clone_range(struct send_ctx *sctx,
return -ENOMEM;
/*
+ * There are inodes that have extents that lie behind its i_size. Don't
+ * accept clones from these extents.
+ */
+ ret = __get_inode_info(clone_root->root, path, clone_root->ino,
+ &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+
+ /*
* We can't send a clone operation for the entire range if we find
* extent items in the respective range in the source file that
* refer to different extents or if we find holes.
@@ -5148,6 +5193,7 @@ static int clone_range(struct send_ctx *sctx,
u8 type;
u64 ext_len;
u64 clone_len;
+ u64 clone_data_offset;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5201,13 +5247,73 @@ static int clone_range(struct send_ctx *sctx,
if (key.offset >= clone_root->offset + len)
break;
+ if (key.offset >= clone_src_i_size)
+ break;
+
+ if (key.offset + ext_len > clone_src_i_size)
+ ext_len = clone_src_i_size - key.offset;
+
+ clone_data_offset = btrfs_file_extent_offset(leaf, ei);
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
+ clone_root->offset = key.offset;
+ if (clone_data_offset < data_offset &&
+ clone_data_offset + ext_len > data_offset) {
+ u64 extent_offset;
+
+ extent_offset = data_offset - clone_data_offset;
+ ext_len -= extent_offset;
+ clone_data_offset += extent_offset;
+ clone_root->offset += extent_offset;
+ }
+ }
+
clone_len = min_t(u64, ext_len, len);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
- btrfs_file_extent_offset(leaf, ei) == data_offset)
- ret = send_clone(sctx, offset, clone_len, clone_root);
- else
+ clone_data_offset == data_offset) {
+ const u64 src_end = clone_root->offset + clone_len;
+ const u64 sectorsize = SZ_64K;
+
+ /*
+ * We can't clone the last block, when its size is not
+ * sector size aligned, into the middle of a file. If we
+ * do so, the receiver will get a failure (-EINVAL) when
+ * trying to clone or will silently corrupt the data in
+ * the destination file if it's on a kernel without the
+ * fix introduced by commit ac765f83f1397646
+ * ("Btrfs: fix data corruption due to cloning of eof
+ * block).
+ *
+ * So issue a clone of the aligned down range plus a
+ * regular write for the eof block, if we hit that case.
+ *
+ * Also, we use the maximum possible sector size, 64K,
+ * because we don't know what's the sector size of the
+ * filesystem that receives the stream, so we have to
+ * assume the largest possible sector size.
+ */
+ if (src_end == clone_src_i_size &&
+ !IS_ALIGNED(src_end, sectorsize) &&
+ offset + clone_len < sctx->cur_inode_size) {
+ u64 slen;
+
+ slen = ALIGN_DOWN(src_end - clone_root->offset,
+ sectorsize);
+ if (slen > 0) {
+ ret = send_clone(sctx, offset, slen,
+ clone_root);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_extent_data(sctx, offset + slen,
+ clone_len - slen);
+ } else {
+ ret = send_clone(sctx, offset, clone_len,
+ clone_root);
+ }
+ } else {
ret = send_extent_data(sctx, offset, clone_len);
+ }
if (ret < 0)
goto out;
@@ -6262,68 +6368,21 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
- if (result == BTRFS_COMPARE_TREE_CHANGED) {
- struct extent_buffer *leaf_l;
- struct extent_buffer *leaf_r;
- struct btrfs_file_extent_item *ei_l;
- struct btrfs_file_extent_item *ei_r;
-
- leaf_l = sctx->left_path->nodes[0];
- leaf_r = sctx->right_path->nodes[0];
- ei_l = btrfs_item_ptr(leaf_l,
- sctx->left_path->slots[0],
- struct btrfs_file_extent_item);
- ei_r = btrfs_item_ptr(leaf_r,
- sctx->right_path->slots[0],
- struct btrfs_file_extent_item);
-
- /*
- * We may have found an extent item that has changed
- * only its disk_bytenr field and the corresponding
- * inode item was not updated. This case happens due to
- * very specific timings during relocation when a leaf
- * that contains file extent items is COWed while
- * relocation is ongoing and its in the stage where it
- * updates data pointers. So when this happens we can
- * safely ignore it since we know it's the same extent,
- * but just at different logical and physical locations
- * (when an extent is fully replaced with a new one, we
- * know the generation number must have changed too,
- * since snapshot creation implies committing the current
- * transaction, and the inode item must have been updated
- * as well).
- * This replacement of the disk_bytenr happens at
- * relocation.c:replace_file_extents() through
- * relocation.c:btrfs_reloc_cow_block().
- */
- if (btrfs_file_extent_generation(leaf_l, ei_l) ==
- btrfs_file_extent_generation(leaf_r, ei_r) &&
- btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_compression(leaf_l, ei_l) ==
- btrfs_file_extent_compression(leaf_r, ei_r) &&
- btrfs_file_extent_encryption(leaf_l, ei_l) ==
- btrfs_file_extent_encryption(leaf_r, ei_r) &&
- btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
- btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
- btrfs_file_extent_type(leaf_l, ei_l) ==
- btrfs_file_extent_type(leaf_r, ei_r) &&
- btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
- btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
- btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_offset(leaf_l, ei_l) ==
- btrfs_file_extent_offset(leaf_r, ei_r) &&
- btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_num_bytes(leaf_r, ei_r))
- return 0;
- }
-
- inconsistent_snapshot_error(sctx, result, "extent");
- return -EIO;
- }
+ /*
+ * We have found an extent item that changed without the inode item
+ * having changed. This can happen either after relocation (where the
+ * disk_bytenr of an extent item is replaced at
+ * relocation.c:replace_file_extents()) or after deduplication into a
+ * file in both the parent and send snapshots (where an extent item can
+ * get modified or replaced with a new one). Note that deduplication
+ * updates the inode item, but it only changes the iversion (sequence
+ * field in the inode item) of the inode, so if a file is deduplicated
+ * the same amount of times in both the parent and send snapshots, its
+ * iversion becames the same in both snapshots, whence the inode item is
+ * the same on both snapshots.
+ */
+ if (sctx->cur_ino != sctx->cmp_key->objectid)
+ return 0;
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
@@ -6501,6 +6560,366 @@ out:
return ret;
}
+static int tree_move_down(struct btrfs_path *path, int *level)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(*level == 0);
+ eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+
+ path->nodes[*level - 1] = eb;
+ path->slots[*level - 1] = 0;
+ (*level)--;
+ return 0;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level)
+ return -1;
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(path, level, root_level);
+ } else {
+ ret = tree_move_down(path, level);
+ }
+ if (ret >= 0) {
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ }
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+static int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t changed_cb, void *ctx)
+{
+ struct btrfs_fs_info *fs_info = left_root->fs_info;
+ int ret;
+ int cmp;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ down_read(&fs_info->commit_root_sem);
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] =
+ btrfs_clone_extent_buffer(left_root->commit_root);
+ if (!left_path->nodes[left_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] =
+ btrfs_clone_extent_buffer(right_root->commit_root);
+ if (!right_path->nodes[right_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ cond_resched();
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret == -1)
+ left_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret == -1)
+ right_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ enum btrfs_compare_tree_result result;
+
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_path, right_path,
+ tmp_buf);
+ if (ret)
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ else
+ result = BTRFS_COMPARE_TREE_SAME;
+ ret = changed_cb(left_path, right_path,
+ &left_key, result, ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kvfree(tmp_buf);
+ return ret;
+}
+
static int send_subvol(struct send_ctx *sctx)
{
int ret;
@@ -6579,6 +6998,38 @@ commit_trans:
return btrfs_commit_transaction(trans);
}
+/*
+ * Make sure any existing dellaloc is flushed for any root used by a send
+ * operation so that we do not miss any data and we do not race with writeback
+ * finishing and changing a tree while send is using the tree. This could
+ * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
+ * a send operation then uses the subvolume.
+ * After flushing delalloc ensure_commit_roots_uptodate() must be called.
+ */
+static int flush_delalloc_roots(struct send_ctx *sctx)
+{
+ struct btrfs_root *root = sctx->parent_root;
+ int ret;
+ int i;
+
+ if (root) {
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ }
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ }
+
+ return 0;
+}
+
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
{
spin_lock(&root->root_item_lock);
@@ -6594,6 +7045,13 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
spin_unlock(&root->root_item_lock);
}
+static void dedupe_in_progress_warn(const struct btrfs_root *root)
+{
+ btrfs_warn_rl(root->fs_info,
+"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
+ root->root_key.objectid, root->dedupe_in_progress);
+}
+
long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
@@ -6617,16 +7075,15 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
* making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
+ if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(send_root);
+ spin_unlock(&send_root->root_item_lock);
+ return -EAGAIN;
+ }
send_root->send_in_progress++;
spin_unlock(&send_root->root_item_lock);
/*
- * This is done when we lookup the root, it should already be complete
- * by the time we get here.
- */
- WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
-
- /*
* Userspace tools do the checks and warn the user if it's
* not RO.
*/
@@ -6751,6 +7208,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
ret = -EPERM;
goto out;
}
+ if (clone_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(clone_root);
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EAGAIN;
+ goto out;
+ }
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6785,6 +7249,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
ret = -EPERM;
goto out;
}
+ if (sctx->parent_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(sctx->parent_root);
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EAGAIN;
+ goto out;
+ }
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6803,13 +7274,31 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
NULL);
sort_clone_roots = 1;
+ ret = flush_delalloc_roots(sctx);
+ if (ret)
+ goto out;
+
ret = ensure_commit_roots_uptodate(sctx);
if (ret)
goto out;
+ mutex_lock(&fs_info->balance_mutex);
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ btrfs_warn_rl(fs_info,
+ "cannot run send because a balance operation is in progress");
+ ret = -EAGAIN;
+ goto out;
+ }
+ fs_info->send_in_progress++;
+ mutex_unlock(&fs_info->balance_mutex);
+
current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
+ mutex_lock(&fs_info->balance_mutex);
+ fs_info->send_in_progress--;
+ mutex_unlock(&fs_info->balance_mutex);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
new file mode 100644
index 000000000000..f09aa6ee9113
--- /dev/null
+++ b/fs/btrfs/space-info.c
@@ -0,0 +1,1115 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "space-info.h"
+#include "sysfs.h"
+#include "volumes.h"
+#include "free-space-cache.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "block-group.h"
+
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+ bool may_use_included)
+{
+ ASSERT(s_info);
+ return s_info->bytes_used + s_info->bytes_reserved +
+ s_info->bytes_pinned + s_info->bytes_readonly +
+ (may_use_included ? s_info->bytes_may_use : 0);
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list)
+ found->full = 0;
+ rcu_read_unlock();
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int i;
+ int ret;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(space_info);
+ return ret;
+ }
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&space_info->block_groups[i]);
+ init_rwsem(&space_info->groups_sem);
+ spin_lock_init(&space_info->lock);
+ space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ INIT_LIST_HEAD(&space_info->ro_bgs);
+ INIT_LIST_HEAD(&space_info->tickets);
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+
+ ret = btrfs_sysfs_add_space_info_type(info, space_info);
+ if (ret)
+ return ret;
+
+ list_add_rcu(&space_info->list, &info->space_info);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = space_info;
+
+ return ret;
+}
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 flags;
+ int mixed = 0;
+ int ret;
+
+ disk_super = fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ return -EINVAL;
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
+
+ if (mixed) {
+ flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+ ret = create_space_info(fs_info, flags);
+ } else {
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
+
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ ret = create_space_info(fs_info, flags);
+ }
+out:
+ return ret;
+}
+
+void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+ int factor;
+
+ factor = btrfs_bg_type_to_factor(flags);
+
+ found = btrfs_find_space_info(info, flags);
+ ASSERT(found);
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
+ found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
+ if (total_bytes > 0)
+ found->full = 0;
+ btrfs_try_granting_tickets(info, found);
+ spin_unlock(&found->lock);
+ *space_info = found;
+}
+
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+ u64 flags)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & flags) {
+ rcu_read_unlock();
+ return found;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+ return (global->size << 1);
+}
+
+static int can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
+{
+ u64 profile;
+ u64 avail;
+ u64 used;
+ int factor;
+
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ if (system_chunk)
+ profile = btrfs_system_alloc_profile(fs_info);
+ else
+ profile = btrfs_metadata_alloc_profile(fs_info);
+
+ used = btrfs_space_info_used(space_info, true);
+ avail = atomic64_read(&fs_info->free_chunk_space);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually usable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
+ */
+ factor = btrfs_bg_type_to_factor(profile);
+ avail = div_u64(avail, factor);
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
+
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ struct list_head *head;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+
+ lockdep_assert_held(&space_info->lock);
+
+ head = &space_info->priority_tickets;
+again:
+ while (!list_empty(head)) {
+ struct reserve_ticket *ticket;
+ u64 used = btrfs_space_info_used(space_info, true);
+
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+
+ /* Check and see if our ticket can be satisified now. */
+ if ((used + ticket->bytes <= space_info->total_bytes) ||
+ can_overcommit(fs_info, space_info, ticket->bytes, flush,
+ false)) {
+ btrfs_space_info_update_bytes_may_use(fs_info,
+ space_info,
+ ticket->bytes);
+ list_del_init(&ticket->list);
+ ticket->bytes = 0;
+ space_info->tickets_id++;
+ wake_up(&ticket->wait);
+ } else {
+ break;
+ }
+ }
+
+ if (head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ goto again;
+ }
+}
+
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
+static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info)
+{
+ lockdep_assert_held(&info->lock);
+
+ btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+ info->flags,
+ info->total_bytes - btrfs_space_info_used(info, true),
+ info->full ? "" : "not ");
+ btrfs_info(fs_info,
+ "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
+ info->total_bytes, info->bytes_used, info->bytes_pinned,
+ info->bytes_reserved, info->bytes_may_use,
+ info->bytes_readonly);
+
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
+}
+
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
+{
+ struct btrfs_block_group *cache;
+ int index = 0;
+
+ spin_lock(&info->lock);
+ __btrfs_dump_space_info(fs_info, info);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
+
+ down_read(&info->groups_sem);
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
+ spin_lock(&cache->lock);
+ btrfs_info(fs_info,
+ "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
+ cache->start, cache->length, cache->used, cache->pinned,
+ cache->reserved, cache->ro ? "[readonly]" : "");
+ btrfs_dump_free_space(cache, bytes);
+ spin_unlock(&cache->lock);
+ }
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
+ up_read(&info->groups_sem);
+}
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
+ unsigned long nr_pages, int nr_items)
+{
+ struct super_block *sb = fs_info->sb;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
+ /*
+ * We needn't worry the filesystem going from r/w to r/o though
+ * we don't acquire ->s_umount mutex, because the filesystem
+ * should guarantee the delalloc inodes list be empty after
+ * the filesystem is readonly(all dirty pages are written to
+ * the disk).
+ */
+ btrfs_start_delalloc_roots(fs_info, nr_items);
+ if (!current->journal_info)
+ btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
+ }
+}
+
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+ u64 to_reclaim)
+{
+ u64 bytes;
+ u64 nr;
+
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ nr = div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM SZ_256K
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
+ u64 orig, bool wait_ordered)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+ u64 delalloc_bytes;
+ u64 dio_bytes;
+ u64 async_pages;
+ u64 items;
+ long time_left;
+ unsigned long nr_pages;
+ int loops;
+
+ /* Calc the number of the pages we need flush for space reservation */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim);
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ delalloc_bytes = percpu_counter_sum_positive(
+ &fs_info->delalloc_bytes);
+ dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+ if (delalloc_bytes == 0 && dio_bytes == 0) {
+ if (trans)
+ return;
+ if (wait_ordered)
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ return;
+ }
+
+ /*
+ * If we are doing more ordered than delalloc we need to just wait on
+ * ordered extents, otherwise we'll waste time trying to flush delalloc
+ * that likely won't give us the space back we need.
+ */
+ if (dio_bytes > delalloc_bytes)
+ wait_ordered = true;
+
+ loops = 0;
+ while ((delalloc_bytes || dio_bytes) && loops < 3) {
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
+ btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
+ /*
+ * We need to wait for the compressed pages to start before
+ * we continue.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e if there are more async pages than we
+ * require wait_event will wait until nr_pages are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
+ else
+ async_pages -= nr_pages;
+
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ (int)async_pages);
+skip_async:
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
+ delalloc_bytes = percpu_counter_sum_positive(
+ &fs_info->delalloc_bytes);
+ dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+ }
+}
+
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does. Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ struct reserve_ticket *ticket = NULL;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_trans_handle *trans;
+ u64 bytes_needed;
+ u64 reclaim_bytes = 0;
+ u64 cur_free_bytes = 0;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ if (trans)
+ return -EAGAIN;
+
+ spin_lock(&space_info->lock);
+ cur_free_bytes = btrfs_space_info_used(space_info, true);
+ if (cur_free_bytes < space_info->total_bytes)
+ cur_free_bytes = space_info->total_bytes - cur_free_bytes;
+ else
+ cur_free_bytes = 0;
+
+ if (!list_empty(&space_info->priority_tickets))
+ ticket = list_first_entry(&space_info->priority_tickets,
+ struct reserve_ticket, list);
+ else if (!list_empty(&space_info->tickets))
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ bytes_needed = (ticket) ? ticket->bytes : 0;
+
+ if (bytes_needed > cur_free_bytes)
+ bytes_needed -= cur_free_bytes;
+ else
+ bytes_needed = 0;
+ spin_unlock(&space_info->lock);
+
+ if (!bytes_needed)
+ return 0;
+
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * See if there is enough pinned space to make this reservation, or if
+ * we have block groups that are going to be freed, allowing us to
+ * possibly do a chunk allocation the next loop through.
+ */
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+ __percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+ goto commit;
+
+ /*
+ * See if there is some space in the delayed insertion reservation for
+ * this reservation.
+ */
+ if (space_info != delayed_rsv->space_info)
+ goto enospc;
+
+ spin_lock(&delayed_rsv->lock);
+ reclaim_bytes += delayed_rsv->reserved;
+ spin_unlock(&delayed_rsv->lock);
+
+ spin_lock(&delayed_refs_rsv->lock);
+ reclaim_bytes += delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ if (reclaim_bytes >= bytes_needed)
+ goto commit;
+ bytes_needed -= reclaim_bytes;
+
+ if (__percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+ goto enospc;
+
+commit:
+ return btrfs_commit_transaction(trans);
+enospc:
+ btrfs_end_transaction(trans);
+ return -ENOSPC;
+}
+
+/*
+ * Try to flush some data based on policy set by @state. This is only advisory
+ * and may fail for various reasons. The caller is supposed to examine the
+ * state of @space_info to detect the outcome.
+ */
+static void flush_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ int state)
+{
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
+ else
+ nr = -1;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case FLUSH_DELAYED_REFS_NR:
+ case FLUSH_DELAYED_REFS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ if (state == FLUSH_DELAYED_REFS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ else
+ nr = 0;
+ btrfs_run_delayed_refs(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
+ case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_chunk_alloc(trans,
+ btrfs_metadata_alloc_profile(fs_info),
+ (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
+ CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret > 0 || ret == -ENOSPC)
+ ret = 0;
+ break;
+ case RUN_DELAYED_IPUTS:
+ /*
+ * If we have pending delayed iputs then we could free up a
+ * bunch of pinned space, so make sure we run the iputs before
+ * we do our pinned bytes check below.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+ btrfs_wait_on_delayed_iputs(fs_info);
+ break;
+ case COMMIT_TRANS:
+ ret = may_commit_transaction(fs_info, space_info);
+ break;
+ default:
+ ret = -ENOSPC;
+ break;
+ }
+
+ trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
+ ret);
+ return;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool system_chunk)
+{
+ struct reserve_ticket *ticket;
+ u64 used;
+ u64 expected;
+ u64 to_reclaim = 0;
+
+ list_for_each_entry(ticket, &space_info->tickets, list)
+ to_reclaim += ticket->bytes;
+ list_for_each_entry(ticket, &space_info->priority_tickets, list)
+ to_reclaim += ticket->bytes;
+ if (to_reclaim)
+ return to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+ if (can_overcommit(fs_info, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+
+ if (can_overcommit(fs_info, space_info, SZ_1M,
+ BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ expected = div_factor_fine(space_info->total_bytes, 95);
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90);
+
+ if (used > expected)
+ to_reclaim = used - expected;
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved);
+ return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 used, bool system_chunk)
+{
+ u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+ /* If we're just plain full then async reclaim just slows us down. */
+ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+ return 0;
+
+ if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ system_chunk))
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+/*
+ * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * @fs_info - fs_info for this fs
+ * @space_info - the space info we were flushing
+ *
+ * We call this when we've exhausted our flushing ability and haven't made
+ * progress in satisfying tickets. The reservation code handles tickets in
+ * order, so if there is a large ticket first and then smaller ones we could
+ * very well satisfy the smaller tickets. This will attempt to wake up any
+ * tickets in the list to catch this case.
+ *
+ * This function returns true if it was able to make progress by clearing out
+ * other tickets, or if it stumbles across a ticket that was smaller than the
+ * first ticket.
+ */
+static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ struct reserve_ticket *ticket;
+ u64 tickets_id = space_info->tickets_id;
+ u64 first_ticket_bytes = 0;
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
+ __btrfs_dump_space_info(fs_info, space_info);
+ }
+
+ while (!list_empty(&space_info->tickets) &&
+ tickets_id == space_info->tickets_id) {
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+
+ /*
+ * may_commit_transaction will avoid committing the transaction
+ * if it doesn't feel like the space reclaimed by the commit
+ * would result in the ticket succeeding. However if we have a
+ * smaller ticket in the queue it may be small enough to be
+ * satisified by committing the transaction, so if any
+ * subsequent ticket is smaller than the first ticket go ahead
+ * and send us back for another loop through the enospc flushing
+ * code.
+ */
+ if (first_ticket_bytes == 0)
+ first_ticket_bytes = ticket->bytes;
+ else if (first_ticket_bytes > ticket->bytes)
+ return true;
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_info(fs_info, "failing ticket with %llu bytes",
+ ticket->bytes);
+
+ list_del_init(&ticket->list);
+ ticket->error = -ENOSPC;
+ wake_up(&ticket->wait);
+
+ /*
+ * We're just throwing tickets away, so more flushing may not
+ * trip over btrfs_try_granting_tickets, so we need to call it
+ * here to see if we can make progress with the next ticket in
+ * the list.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ }
+ return (tickets_id != space_info->tickets_id);
+}
+
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+ int commit_cycles = 0;
+ u64 last_tickets_id;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
+ if (!to_reclaim) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info, space_info, to_reclaim, flush_state);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+ space_info,
+ false);
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ if (commit_cycles)
+ commit_cycles--;
+ }
+
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create a
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
+ if (flush_state > COMMIT_TRANS) {
+ commit_cycles++;
+ if (commit_cycles > 2) {
+ if (maybe_fail_all_tickets(fs_info, space_info)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
+ } else {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
+static const enum btrfs_flush_state evict_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ FLUSH_DELAYED_REFS_NR,
+ FLUSH_DELAYED_REFS,
+ FLUSH_DELALLOC,
+ FLUSH_DELALLOC_WAIT,
+ ALLOC_CHUNK,
+ COMMIT_TRANS,
+};
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ const enum btrfs_flush_state *states,
+ int states_nr)
+{
+ u64 to_reclaim;
+ int flush_state;
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
+ false);
+ if (!to_reclaim) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+
+ flush_state = 0;
+ do {
+ flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
+ flush_state++;
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state < states_nr);
+}
+
+static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ while (ticket->bytes > 0 && ticket->error == 0) {
+ ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ if (ret) {
+ /*
+ * Delete us from the list. After we unlock the space
+ * info, we don't want the async reclaim job to reserve
+ * space for this ticket. If that would happen, then the
+ * ticket's task would not known that space was reserved
+ * despite getting an error, resulting in a space leak
+ * (bytes_may_use counter of our space_info).
+ */
+ list_del_init(&ticket->list);
+ ticket->error = -EINTR;
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ schedule();
+
+ finish_wait(&ticket->wait, &wait);
+ spin_lock(&space_info->lock);
+ }
+ spin_unlock(&space_info->lock);
+}
+
+/**
+ * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
+ * @fs_info - the fs
+ * @space_info - the space_info for the reservation
+ * @ticket - the ticket for the reservation
+ * @flush - how much we can flush
+ *
+ * This does the work of figuring out how to flush for the ticket, waiting for
+ * the reservation, and returning the appropriate error if there is one.
+ */
+static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ switch (flush) {
+ case BTRFS_RESERVE_FLUSH_ALL:
+ wait_reserve_ticket(fs_info, space_info, ticket);
+ break;
+ case BTRFS_RESERVE_FLUSH_LIMIT:
+ priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_flush_states,
+ ARRAY_SIZE(priority_flush_states));
+ break;
+ case BTRFS_RESERVE_FLUSH_EVICT:
+ priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ evict_flush_states,
+ ARRAY_SIZE(evict_flush_states));
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ spin_lock(&space_info->lock);
+ ret = ticket->error;
+ if (ticket->bytes || ticket->error) {
+ /*
+ * Need to delete here for priority tickets. For regular tickets
+ * either the async reclaim job deletes the ticket from the list
+ * or we delete it ourselves at wait_reserve_ticket().
+ */
+ list_del_init(&ticket->list);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+ spin_unlock(&space_info->lock);
+ ASSERT(list_empty(&ticket->list));
+ /*
+ * Check that we can't have an error set if the reservation succeeded,
+ * as that would confuse tasks and lead them to error out without
+ * releasing reserved space (if an error happens the expectation is that
+ * space wasn't reserved at all).
+ */
+ ASSERT(!(ticket->bytes == 0 && ticket->error));
+ return ret;
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @space_info - the space info we want to allocate from
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush,
+ bool system_chunk)
+{
+ struct reserve_ticket ticket;
+ u64 used;
+ int ret = 0;
+ bool pending_tickets;
+
+ ASSERT(orig_bytes);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+
+ spin_lock(&space_info->lock);
+ ret = -ENOSPC;
+ used = btrfs_space_info_used(space_info, true);
+ pending_tickets = !list_empty(&space_info->tickets) ||
+ !list_empty(&space_info->priority_tickets);
+
+ /*
+ * Carry on if we have enough space (short-circuit) OR call
+ * can_overcommit() to ensure we can overcommit to continue.
+ */
+ if (!pending_tickets &&
+ ((used + orig_bytes <= space_info->total_bytes) ||
+ can_overcommit(fs_info, space_info, orig_bytes, flush,
+ system_chunk))) {
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+ orig_bytes);
+ ret = 0;
+ }
+
+ /*
+ * If we couldn't make a reservation then setup our reservation ticket
+ * and kick the async worker if it's not already running.
+ *
+ * If we are a priority flusher then we just need to add our ticket to
+ * the list and we will do our own flushing further down.
+ */
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.bytes = orig_bytes;
+ ticket.error = 0;
+ init_waitqueue_head(&ticket.wait);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ list_add_tail(&ticket.list, &space_info->tickets);
+ if (!space_info->flush) {
+ space_info->flush = 1;
+ trace_btrfs_trigger_flush(fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "enospc");
+ queue_work(system_unbound_wq,
+ &fs_info->async_reclaim_work);
+ }
+ } else {
+ list_add_tail(&ticket.list,
+ &space_info->priority_tickets);
+ }
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
+ need_do_async_reclaim(fs_info, space_info,
+ used, system_chunk) &&
+ !work_busy(&fs_info->async_reclaim_work)) {
+ trace_btrfs_trigger_flush(fs_info, space_info->flags,
+ orig_bytes, flush, "preempt");
+ queue_work(system_unbound_wq,
+ &fs_info->async_reclaim_work);
+ }
+ }
+ spin_unlock(&space_info->lock);
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+ return ret;
+
+ return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ int ret;
+ bool system_chunk = (root == fs_info->chunk_root);
+
+ ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ orig_bytes, flush, system_chunk);
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ if (block_rsv != global_rsv &&
+ !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ block_rsv->space_info->flags,
+ orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
new file mode 100644
index 000000000000..1a349e3f9cc1
--- /dev/null
+++ b/fs/btrfs/space-info.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SPACE_INFO_H
+#define BTRFS_SPACE_INFO_H
+
+struct btrfs_space_info {
+ spinlock_t lock;
+
+ u64 total_bytes; /* total bytes in the space,
+ this doesn't take mirrors into account */
+ u64 bytes_used; /* total bytes used,
+ this doesn't take mirrors into account */
+ u64 bytes_pinned; /* total bytes pinned, will be freed when the
+ transaction finishes */
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_may_use; /* number of bytes that may be used for
+ delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
+ chunks for this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int flush:1; /* set if we are trying to make space */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
+
+ u64 disk_used; /* total bytes used on disk */
+ u64 disk_total; /* total bytes on disk, takes mirrors into
+ account */
+
+ u64 flags;
+
+ /*
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed, so this counter is inc'ed every time we
+ * call btrfs_free_extent so it is a realtime count of what will be
+ * freed once the transaction is committed. It will be zeroed every
+ * time the transaction commits.
+ */
+ struct percpu_counter total_bytes_pinned;
+
+ struct list_head list;
+ /* Protected by the spinlock 'lock'. */
+ struct list_head ro_bgs;
+ struct list_head priority_tickets;
+ struct list_head tickets;
+ /*
+ * tickets_id just indicates the next ticket will be handled, so note
+ * it's not stored per ticket.
+ */
+ u64 tickets_id;
+
+ struct rw_semaphore groups_sem;
+ /* for block groups in our same type */
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+
+ struct kobject kobj;
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+};
+
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
+};
+
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+ return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
+/*
+ *
+ * Declare a helper function to detect underflow of various space info members
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
+static inline void \
+btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
+ struct btrfs_space_info *sinfo, \
+ s64 bytes) \
+{ \
+ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
+ lockdep_assert_held(&sinfo->lock); \
+ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
+ trace_btrfs_space_reservation(fs_info, trace_name, \
+ sinfo->flags, abs_bytes, \
+ bytes > 0); \
+ if (bytes < 0 && sinfo->name < -bytes) { \
+ WARN_ON(1); \
+ sinfo->name = 0; \
+ return; \
+ } \
+ sinfo->name += bytes; \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
+ struct btrfs_space_info **space_info);
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+ u64 flags);
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+ bool may_use_included);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info);
+
+static inline void btrfs_space_info_free_bytes_may_use(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ spin_lock(&space_info->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 4c13b737f568..73f7987143df 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -33,6 +33,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
*
* The extent buffer api is used to do the page spanning work required to
* have a metadata blocksize different from the page size.
+ *
+ * There are 2 variants defined, one with a token pointer and one without.
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
@@ -50,8 +52,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
int size = sizeof(u##bits); \
u##bits res; \
\
- if (token && token->kaddr && token->offset <= offset && \
- token->eb == eb && \
+ ASSERT(token); \
+ ASSERT(token->eb == eb); \
+ \
+ if (token->kaddr && token->offset <= offset && \
(token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
@@ -68,11 +72,33 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
} \
p = kaddr + part_offset - map_start; \
res = get_unaligned_le##bits(p + off); \
- if (token) { \
- token->kaddr = kaddr; \
- token->offset = map_start; \
- token->eb = eb; \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ return res; \
+} \
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off) \
+{ \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ u##bits res; \
+ \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits leres; \
+ \
+ read_extent_buffer(eb, &leres, offset, size); \
+ return le##bits##_to_cpu(leres); \
} \
+ p = kaddr + part_offset - map_start; \
+ res = get_unaligned_le##bits(p + off); \
return res; \
} \
void btrfs_set_token_##bits(struct extent_buffer *eb, \
@@ -89,8 +115,10 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \
unsigned long map_len; \
int size = sizeof(u##bits); \
\
- if (token && token->kaddr && token->offset <= offset && \
- token->eb == eb && \
+ ASSERT(token); \
+ ASSERT(token->eb == eb); \
+ \
+ if (token->kaddr && token->offset <= offset && \
(token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
@@ -108,11 +136,32 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \
} \
p = kaddr + part_offset - map_start; \
put_unaligned_le##bits(val, p + off); \
- if (token) { \
- token->kaddr = kaddr; \
- token->offset = map_start; \
- token->eb = eb; \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+} \
+void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits val2; \
+ \
+ val2 = cpu_to_le##bits(val); \
+ write_extent_buffer(eb, &val2, offset, size); \
+ return; \
} \
+ p = kaddr + part_offset - map_start; \
+ put_unaligned_le##bits(val, p + off); \
}
DEFINE_BTRFS_SETGET_BITS(8)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 120e4340792a..f452a94abdc3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,7 +42,10 @@
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
+#include "space-info.h"
+#include "sysfs.h"
#include "tests/btrfs-tests.h"
+#include "block-group.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
@@ -63,7 +66,7 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-const char *btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -184,7 +187,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -1216,7 +1219,7 @@ static int btrfs_fill_super(struct super_block *sb,
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -1400,7 +1403,7 @@ static inline int is_subvolume_inode(struct inode *inode)
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
- const char *device_name, struct vfsmount *mnt)
+ struct vfsmount *mnt)
{
struct dentry *root;
int ret;
@@ -1553,6 +1556,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
+ if (!strstr(crc32c_impl(), "generic"))
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
@@ -1601,14 +1606,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct vfsmount *mnt_root;
struct dentry *root;
- fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
int error = 0;
- if (!(flags & SB_RDONLY))
- mode |= FMODE_WRITE;
-
error = btrfs_parse_subvol_options(data, &subvol_name,
&subvol_objectid);
if (error) {
@@ -1649,7 +1650,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
/* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root);
+ root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
out:
return root;
@@ -1668,7 +1669,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
@@ -1900,12 +1900,12 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- u64 skip_space;
u64 type;
u64 avail_space;
u64 min_stripe_size;
- int min_stripes = 1, num_stripes = 1;
+ int num_stripes = 1;
int i = 0, nr_devices;
+ const struct btrfs_raid_attr *rattr;
/*
* We aren't under the device list lock, so this is racy-ish, but good
@@ -1929,21 +1929,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
/* calc min stripe number for data space allocation */
type = btrfs_data_alloc_profile(fs_info);
- if (type & BTRFS_BLOCK_GROUP_RAID0) {
- min_stripes = 2;
+ rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
+
+ if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
- } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
- min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
- } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
- min_stripes = 4;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
+ num_stripes = 3;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
+ num_stripes = 4;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
- }
- if (type & BTRFS_BLOCK_GROUP_DUP)
- min_stripe_size = 2 * BTRFS_STRIPE_LEN;
- else
- min_stripe_size = BTRFS_STRIPE_LEN;
+ /* Adjust for more than 1 stripe per device */
+ min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
@@ -1959,28 +1959,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
- avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
- avail_space *= BTRFS_STRIPE_LEN;
+ avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
* In order to avoid overwriting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
+ *
+ * This ensures we have at least min_stripe_size free space
+ * after excluding 1MB.
*/
- skip_space = SZ_1M;
-
- /*
- * we can use the free space in [0, skip_space - 1], subtract
- * it from the total.
- */
- if (avail_space && avail_space >= skip_space)
- avail_space -= skip_space;
- else
- avail_space = 0;
-
- if (avail_space < min_stripe_size)
+ if (avail_space <= SZ_1M + min_stripe_size)
continue;
+ avail_space -= SZ_1M;
+
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
@@ -1994,9 +1987,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
i = nr_devices - 1;
avail_space = 0;
- while (nr_devices >= min_stripes) {
- if (num_stripes > nr_devices)
- num_stripes = nr_devices;
+ while (nr_devices >= rattr->devs_min) {
+ num_stripes = min(num_stripes, nr_devices);
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
@@ -2033,7 +2025,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
@@ -2047,7 +2038,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
int mixed = 0;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry_rcu(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2298,6 +2289,7 @@ static const struct super_operations btrfs_super_ops = {
.show_devname = btrfs_show_devname,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
+ .free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
@@ -2307,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = {
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
- .compat_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
@@ -2370,10 +2362,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_cachep;
- err = extent_map_init();
+ err = extent_state_cache_init();
if (err)
goto free_extent_io;
+ err = extent_map_init();
+ if (err)
+ goto free_extent_state_cache;
+
err = ordered_data_init();
if (err)
goto free_extent_map;
@@ -2432,6 +2428,8 @@ free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
+free_extent_state_cache:
+ extent_state_cache_exit();
free_extent_io:
extent_io_exit();
free_cachep:
@@ -2452,6 +2450,7 @@ static void __exit exit_btrfs_fs(void)
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
+ extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
btrfs_end_io_wq_exit();
@@ -2465,3 +2464,7 @@ late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: xxhash64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: blake2b-256");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5a5930e3d32b..5ebbe8a5ee76 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -4,22 +4,88 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
-#include <linux/kobject.h>
#include <linux/bug.h>
-#include <linux/debugfs.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
+#include "space-info.h"
+#include "block-group.h"
+
+struct btrfs_feature_attr {
+ struct kobj_attribute kobj_attr;
+ enum btrfs_feature_set feature_set;
+ u64 feature_bit;
+};
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+ u64 flags;
+ struct kobject kobj;
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_prefix, _name, _show) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_ATTR_PTR(_prefix, _name) \
+ (&btrfs_attr_##_prefix##_##_name.attr)
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
+ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
+ btrfs_feature_attr_show, \
+ btrfs_feature_attr_store), \
+ .feature_set = _feature_set, \
+ .feature_bit = _feature_prefix ##_## _feature_bit, \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name) \
+ (&btrfs_attr_features_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+ return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+ return container_of(attr, struct kobj_attribute, attr);
+}
+
+static struct btrfs_feature_attr *attr_to_btrfs_feature_attr(
+ struct attribute *attr)
+{
+ return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
static u64 get_features(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set)
{
@@ -193,6 +259,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -207,6 +274,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(no_holes),
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
+ BTRFS_FEAT_ATTR_PTR(raid1c34),
NULL
};
@@ -230,8 +298,30 @@ static ssize_t rmdir_subvol_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+static ssize_t supported_checksums_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < btrfs_get_num_csums(); i++) {
+ /*
+ * This "trick" only works as long as 'enum btrfs_csum_type' has
+ * no holes in it
+ */
+ ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+
+ }
+
+ ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ BTRFS_ATTR_PTR(static_feature, supported_checksums),
NULL
};
@@ -246,6 +336,25 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
.attrs = btrfs_supported_static_feature_attrs,
};
+#ifdef CONFIG_BTRFS_DEBUG
+
+/*
+ * Runtime debugging exported via sysfs
+ *
+ * /sys/fs/btrfs/debug - applies to module or all filesystems
+ * /sys/fs/btrfs/UUID - applies only to the given filesystem
+ */
+static struct attribute *btrfs_debug_feature_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group btrfs_debug_feature_attr_group = {
+ .name = "debug",
+ .attrs = btrfs_debug_feature_attrs,
+};
+
+#endif
+
static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
{
u64 val;
@@ -288,36 +397,37 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
u64 val = 0;
down_read(&sinfo->groups_sem);
list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
- val += block_group->key.offset;
+ val += block_group->length;
else
- val += btrfs_block_group_used(&block_group->item);
+ val += block_group->used;
}
up_read(&sinfo->groups_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", val);
}
-static struct attribute *raid_attributes[] = {
+static struct attribute *raid_attrs[] = {
BTRFS_ATTR_PTR(raid, total_bytes),
BTRFS_ATTR_PTR(raid, used_bytes),
NULL
};
+ATTRIBUTE_GROUPS(raid);
static void release_raid_kobj(struct kobject *kobj)
{
kfree(to_raid_kobj(kobj));
}
-struct kobj_type btrfs_raid_ktype = {
+static struct kobj_type btrfs_raid_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = release_raid_kobj,
- .default_attrs = raid_attributes,
+ .default_groups = raid_groups,
};
#define SPACE_INFO_ATTR(field) \
@@ -364,6 +474,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
NULL,
};
+ATTRIBUTE_GROUPS(space_info);
static void space_info_release(struct kobject *kobj)
{
@@ -372,10 +483,10 @@ static void space_info_release(struct kobject *kobj)
kfree(sinfo);
}
-struct kobj_type space_info_ktype = {
+static struct kobj_type space_info_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = space_info_release,
- .default_attrs = space_info_attrs,
+ .default_groups = space_info_groups,
};
static const struct attribute *allocation_attrs[] = {
@@ -518,6 +629,19 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+static ssize_t btrfs_checksum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
+
+ return snprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
+}
+
+BTRFS_ATTR(, checksum, btrfs_checksum_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -525,6 +649,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, clone_alignment),
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
+ BTRFS_ATTR_PTR(, checksum),
NULL,
};
@@ -652,12 +777,17 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
-const char * const btrfs_feature_set_names[FEAT_MAX] = {
+static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_COMPAT] = "compat",
[FEAT_COMPAT_RO] = "compat_ro",
[FEAT_INCOMPAT] = "incompat",
};
+const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+{
+ return btrfs_feature_set_names[set];
+}
+
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
{
size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
@@ -727,6 +857,110 @@ static void init_feature_attrs(void)
}
}
+/*
+ * Create a sysfs entry for a given block group type at path
+ * /sys/fs/btrfs/UUID/allocation/data/TYPE
+ */
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
+ struct raid_kobject *rkobj;
+ const int index = btrfs_bg_flags_to_raid_index(cache->flags);
+ unsigned int nofs_flag;
+ int ret;
+
+ /*
+ * Setup a NOFS context because kobject_add(), deep in its call chain,
+ * does GFP_KERNEL allocations, and we are often called in a context
+ * where if reclaim is triggered we can deadlock (we are either holding
+ * a transaction handle or some lock required for a transaction
+ * commit).
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ memalloc_nofs_restore(nofs_flag);
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
+ }
+
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
+ btrfs_bg_type_to_raid_name(rkobj->flags));
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+ return;
+ }
+
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+}
+
+/*
+ * Remove sysfs directories for all block group types of a given space info and
+ * the space info as well
+ */
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ struct kobject *kobj;
+
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+ }
+ kobject_del(&space_info->kobj);
+ kobject_put(&space_info->kobj);
+}
+
+static const char *alloc_name(u64 flags)
+{
+ switch (flags) {
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "mixed";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "metadata";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "data";
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "system";
+ default:
+ WARN_ON(1);
+ return "invalid-combination";
+ };
+}
+
+/*
+ * Create a sysfs entry for a space info type at path
+ * /sys/fs/btrfs/UUID/allocation/TYPE
+ */
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ int ret;
+
+ ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
+ fs_info->space_info_kobj, "%s",
+ alloc_name(space_info->flags));
+ if (ret) {
+ kobject_put(&space_info->kobj);
+ return ret;
+ }
+
+ return 0;
+}
+
/* when one_device is NULL, it removes all device links */
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
@@ -803,14 +1037,34 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
return error;
}
-/* /sys/fs/btrfs/ entry */
-static struct kset *btrfs_kset;
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
+{
+ int ret;
+
+ ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+ if (ret)
+ pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+ &disk_to_dev(bdev->bd_disk)->kobj);
+}
-/* /sys/kernel/debug/btrfs */
-static struct dentry *btrfs_debugfs_root_dentry;
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
+ const u8 *fsid)
+{
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
+ /*
+ * Sprouting changes fsid of the mounted filesystem, rename the fsid
+ * directory
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
+ btrfs_warn(fs_devices->fs_info,
+ "sysfs: failed to create fsid for sprout");
+}
-/* Debugging tunables and exported data */
-u64 btrfs_debugfs_test;
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
/*
* Can be called by the device discovery thread.
@@ -825,7 +1079,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
fs_devs->fsid_kobj.kset = btrfs_kset;
error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
- return error;
+ if (error) {
+ kobject_put(&fs_devs->fsid_kobj);
+ return error;
+ }
+
+ return 0;
}
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
@@ -851,6 +1110,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
if (error)
goto failure;
+#ifdef CONFIG_BTRFS_DEBUG
+ error = sysfs_create_group(fsid_kobj,
+ &btrfs_debug_feature_attr_group);
+ if (error)
+ goto failure;
+#endif
+
error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
@@ -905,28 +1171,6 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
}
-static int btrfs_init_debugfs(void)
-{
-#ifdef CONFIG_DEBUG_FS
- btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
- if (!btrfs_debugfs_root_dentry)
- return -ENOMEM;
-
- /*
- * Example code, how to export data through debugfs.
- *
- * file: /sys/kernel/debug/btrfs/test
- * contents of: btrfs_debugfs_test
- */
-#ifdef CONFIG_BTRFS_DEBUG
- debugfs_create_u64("test", S_IRUGO | S_IWUSR, btrfs_debugfs_root_dentry,
- &btrfs_debugfs_test);
-#endif
-
-#endif
- return 0;
-}
-
int __init btrfs_init_sysfs(void)
{
int ret;
@@ -935,10 +1179,6 @@ int __init btrfs_init_sysfs(void)
if (!btrfs_kset)
return -ENOMEM;
- ret = btrfs_init_debugfs();
- if (ret)
- goto out1;
-
init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
if (ret)
@@ -948,13 +1188,17 @@ int __init btrfs_init_sysfs(void)
if (ret)
goto out_remove_group;
+#ifdef CONFIG_BTRFS_DEBUG
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+ if (ret)
+ goto out2;
+#endif
+
return 0;
out_remove_group:
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
out2:
- debugfs_remove_recursive(btrfs_debugfs_root_dentry);
-out1:
kset_unregister(btrfs_kset);
return ret;
@@ -966,6 +1210,5 @@ void __cold btrfs_exit_sysfs(void)
&btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
- debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 40716b357c1d..e10c3adfc30f 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -3,10 +3,7 @@
#ifndef BTRFS_SYSFS_H
#define BTRFS_SYSFS_H
-/*
- * Data exported through sysfs
- */
-extern u64 btrfs_debugfs_test;
+#include <linux/kobject.h>
enum btrfs_feature_set {
FEAT_COMPAT,
@@ -15,71 +12,8 @@ enum btrfs_feature_set {
FEAT_MAX
};
-#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
-{ \
- .attr = { .name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
-
-#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
- static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
- __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
-
-#define BTRFS_ATTR(_prefix, _name, _show) \
- static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
- __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
-
-#define BTRFS_ATTR_PTR(_prefix, _name) \
- (&btrfs_attr_##_prefix##_##_name.attr)
-
-
-struct btrfs_feature_attr {
- struct kobj_attribute kobj_attr;
- enum btrfs_feature_set feature_set;
- u64 feature_bit;
-};
-
-#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \
-static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
- .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
- btrfs_feature_attr_show, \
- btrfs_feature_attr_store), \
- .feature_set = _feature_set, \
- .feature_bit = _feature_prefix ##_## _feature_bit, \
-}
-#define BTRFS_FEAT_ATTR_PTR(_name) \
- (&btrfs_attr_features_##_name.kobj_attr.attr)
-
-#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
- BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
-#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
- BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
-#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
- BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
-
-/* convert from attribute */
-static inline struct btrfs_feature_attr *
-to_btrfs_feature_attr(struct kobj_attribute *a)
-{
- return container_of(a, struct btrfs_feature_attr, kobj_attr);
-}
-
-static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
-{
- return container_of(attr, struct kobj_attribute, attr);
-}
-
-static inline struct btrfs_feature_attr *
-attr_to_btrfs_feature_attr(struct attribute *attr)
-{
- return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
-}
-
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-extern const char * const btrfs_feature_set_names[FEAT_MAX];
-extern struct kobj_type space_info_ktype;
-extern struct kobj_type btrfs_raid_ktype;
+const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
@@ -88,7 +22,19 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
+ const u8 *fsid);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
+
+int __init btrfs_init_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info);
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 8a59597f1883..a7aca4141788 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -5,6 +5,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
@@ -14,31 +15,50 @@
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
+#include "../block-group.h"
static struct vfsmount *test_mnt = NULL;
+const char *test_error[] = {
+ [TEST_ALLOC_FS_INFO] = "cannot allocate fs_info",
+ [TEST_ALLOC_ROOT] = "cannot allocate root",
+ [TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer",
+ [TEST_ALLOC_PATH] = "cannot allocate path",
+ [TEST_ALLOC_INODE] = "cannot allocate inode",
+ [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
+ [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+};
+
static const struct super_operations btrfs_test_super_ops = {
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_test_destroy_inode,
};
-static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+
+static int btrfs_test_init_fs_context(struct fs_context *fc)
{
- return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
- NULL, BTRFS_TEST_MAGIC);
+ struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->ops = &btrfs_test_super_ops;
+ return 0;
}
static struct file_system_type test_type = {
.name = "btrfs_test_fs",
- .mount = btrfs_test_mount,
+ .init_fs_context = btrfs_test_init_fs_context,
.kill_sb = kill_anon_super,
};
struct inode *btrfs_new_test_inode(void)
{
- return new_inode(test_mnt->mnt_sb);
+ struct inode *inode;
+
+ inode = new_inode(test_mnt->mnt_sb);
+ if (inode)
+ inode_init_owner(inode, NULL, S_IFREG);
+
+ return inode;
}
static int btrfs_init_test_fs(void)
@@ -99,7 +119,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->qgroup_lock);
- spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
@@ -115,8 +134,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- extent_io_tree_init(&fs_info->freed_extents[0], NULL);
- extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
+ IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
+ extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
+ IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
@@ -181,11 +202,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
unsigned long length)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (!cache)
@@ -197,9 +218,8 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
return NULL;
}
- cache->key.objectid = 0;
- cache->key.offset = length;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->start = 0;
+ cache->length = length;
cache->full_stripe_len = fs_info->sectorsize;
cache->fs_info = fs_info;
@@ -212,7 +232,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
return cache;
}
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 70ff9f9d86a1..9e52527357d8 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -10,7 +10,22 @@
int btrfs_run_sanity_tests(void);
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
-#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n", \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define test_std_err(index) test_err("%s", test_error[index])
+
+enum {
+ TEST_ALLOC_FS_INFO,
+ TEST_ALLOC_ROOT,
+ TEST_ALLOC_EXTENT_BUFFER,
+ TEST_ALLOC_PATH,
+ TEST_ALLOC_INODE,
+ TEST_ALLOC_BLOCK_GROUP,
+ TEST_ALLOC_EXTENT_MAP,
+};
+
+extern const char *test_error[];
struct btrfs_root;
struct btrfs_trans_handle;
@@ -26,9 +41,9 @@ struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
#else
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 7d72eab6d32c..a1b9f9b5978e 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -30,27 +30,27 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("could not allocate fs_info");
+ test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("could not allocate root");
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(root);
goto out;
}
path = btrfs_alloc_path();
if (!path) {
- test_err("could not allocate path");
+ test_std_err(TEST_ALLOC_PATH);
ret = -ENOMEM;
goto out;
}
path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!eb) {
- test_err("could not allocate dummy buffer");
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
goto out;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 3c46d7f23456..123d9a614357 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -10,6 +10,7 @@
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../extent_io.h"
+#include "../btrfs_inode.h"
#define PROCESS_UNLOCK (1 << 0)
#define PROCESS_RELEASE (1 << 1)
@@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
- struct extent_io_tree tmp;
+ struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
@@ -73,11 +74,16 @@ static int test_find_delalloc(u32 sectorsize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_err("failed to allocate test inode");
+ test_std_err(TEST_ALLOC_INODE);
return -ENOMEM;
}
+ tmp = &BTRFS_I(inode)->io_tree;
- extent_io_tree_init(&tmp, NULL);
+ /*
+ * Passing NULL as we don't have fs_info but tracepoints are not used
+ * at this point
+ */
+ extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
/*
* First go through and create and mark all of our pages dirty, we pin
@@ -104,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
+ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("should have found at least one delalloc");
@@ -118,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(&tmp, start, end);
+ unlock_extent(tmp, start, end);
unlock_page(locked_page);
put_page(locked_page);
@@ -135,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize)
test_err("couldn't find the locked page");
goto out_bits;
}
- set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
+ set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -154,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(&tmp, start, end);
+ unlock_extent(tmp, start, end);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -172,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
@@ -190,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
+ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("didn't find our range");
@@ -209,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(&tmp, start, end);
+ unlock_extent(tmp, start, end);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -234,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("didn't find our range");
@@ -252,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
+ clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
put_page(locked_page);
@@ -374,8 +380,8 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
unsigned long len;
- unsigned long *bitmap;
- struct extent_buffer *eb;
+ unsigned long *bitmap = NULL;
+ struct extent_buffer *eb = NULL;
int ret;
test_msg("running extent buffer bitmap tests");
@@ -388,18 +394,23 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
? sectorsize * 4 : sectorsize;
fs_info = btrfs_alloc_dummy_fs_info(len, len);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
test_err("couldn't allocate test bitmap");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
if (!eb) {
- test_err("couldn't allocate test extent buffer");
- kfree(bitmap);
- return -ENOMEM;
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = -ENOMEM;
+ goto out;
}
ret = __test_eb_bitmaps(bitmap, eb, len);
@@ -408,17 +419,118 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
/* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
+ eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len);
if (!eb) {
- test_err("couldn't allocate test extent buffer");
- kfree(bitmap);
- return -ENOMEM;
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = -ENOMEM;
+ goto out;
}
ret = __test_eb_bitmaps(bitmap, eb, len);
out:
free_extent_buffer(eb);
kfree(bitmap);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+static int test_find_first_clear_extent_bit(void)
+{
+ struct extent_io_tree tree;
+ u64 start, end;
+ int ret = -EINVAL;
+
+ test_msg("running find_first_clear_extent_bit test");
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+
+ /*
+ * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
+ * 4M-32M
+ */
+ set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != 0 || end != SZ_1M - 1) {
+ test_err("error finding beginning range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /* Now add 32M-64M so that we have a hole between 4M-32M */
+ set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ /*
+ * Request first hole starting at 12M, we should get 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding trimmed range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search in the middle of allocated range, should get the next one
+ * available, which happens to be unallocated -> 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding next unalloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
+ * being unset in this range, we should get the entry in range 64M-72M
+ */
+ set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
+ find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
+
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding exact range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
+
+ /*
+ * Search in the middle of set range whose immediate neighbour doesn't
+ * have the bits set so it must be returned
+ */
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding next alloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search beyond any known range, shall return after last known range
+ * and end should be -1
+ */
+ find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ if (start != SZ_64M + SZ_8M || end != -1) {
+ test_err(
+ "error handling beyond end of range search: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
return ret;
}
@@ -432,8 +544,11 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
if (ret)
goto out;
+ ret = test_find_first_clear_extent_bit();
+ if (ret)
+ goto out;
+
ret = test_eb_bitmaps(sectorsize, nodesize);
out:
- test_msg("extent I/O tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index bf15d3a7f20e..4a7f796c9900 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -47,7 +47,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static void test_case_1(struct btrfs_fs_info *fs_info,
+static int test_case_1(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree)
{
struct extent_map *em;
@@ -56,55 +56,79 @@ static void test_case_1(struct btrfs_fs_info *fs_info,
int ret;
em = alloc_extent_map();
- if (!em)
- /* Skip the test on error. */
- return;
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 16K)");
+ goto out;
+ }
free_extent_map(em);
/* Add [16K, 20K) following [0, 16K) */
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
em->start = SZ_16K;
em->len = SZ_4K;
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [16K, 20K)");
+ goto out;
+ }
free_extent_map(em);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
/* Add [0, 8K), should return [0, 16K) instead. */
em->start = start;
em->len = len;
em->block_start = start;
em->block_len = len;
+ write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
- if (ret)
+ write_unlock(&em_tree->lock);
+ if (ret) {
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
+ goto out;
+ }
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K))
+ em->block_start != 0 || em->block_len != SZ_16K)) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
+ ret = -EINVAL;
+ }
free_extent_map(em);
out:
- /* free memory */
free_extent_map_tree(em_tree);
+
+ return ret;
}
/*
@@ -113,65 +137,89 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static void test_case_2(struct btrfs_fs_info *fs_info,
+static int test_case_2(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree)
{
struct extent_map *em;
int ret;
em = alloc_extent_map();
- if (!em)
- /* Skip the test on error. */
- return;
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 1K)");
+ goto out;
+ }
free_extent_map(em);
- /* Add [4K, 4K) following [0, 1K) */
+ /* Add [4K, 8K) following [0, 1K) */
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
em->start = SZ_4K;
em->len = SZ_4K;
em->block_start = SZ_4K;
em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [4K, 8K)");
+ goto out;
+ }
free_extent_map(em);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
+ write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
- if (ret)
+ write_unlock(&em_tree->lock);
+ if (ret) {
test_err("case2 [0 1K]: ret %d", ret);
+ goto out;
+ }
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
em->block_len);
+ ret = -EINVAL;
+ }
free_extent_map(em);
out:
- /* free memory */
free_extent_map_tree(em_tree);
+
+ return ret;
}
-static void __test_case_3(struct btrfs_fs_info *fs_info,
+static int __test_case_3(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
@@ -179,47 +227,63 @@ static void __test_case_3(struct btrfs_fs_info *fs_info,
int ret;
em = alloc_extent_map();
- if (!em)
- /* Skip this test on error. */
- return;
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
/* Add [4K, 8K) */
em->start = SZ_4K;
em->len = SZ_4K;
em->block_start = SZ_4K;
em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [4K, 8K)");
+ goto out;
+ }
free_extent_map(em);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
- if (ret)
+ write_unlock(&em_tree->lock);
+ if (ret) {
test_err("case3 [0x%llx 0x%llx): ret %d",
start, start + len, ret);
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
if (em &&
(start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len))
+ em->start != em->block_start || em->len != em->block_len)) {
test_err(
"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
+ ret = -EINVAL;
+ }
free_extent_map(em);
out:
- /* free memory */
free_extent_map_tree(em_tree);
+
+ return ret;
}
/*
@@ -238,15 +302,23 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static void test_case_3(struct btrfs_fs_info *fs_info,
+static int test_case_3(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree)
{
- __test_case_3(fs_info, em_tree, 0);
- __test_case_3(fs_info, em_tree, SZ_8K);
- __test_case_3(fs_info, em_tree, (12 * 1024ULL));
+ int ret;
+
+ ret = __test_case_3(fs_info, em_tree, 0);
+ if (ret)
+ return ret;
+ ret = __test_case_3(fs_info, em_tree, SZ_8K);
+ if (ret)
+ return ret;
+ ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+
+ return ret;
}
-static void __test_case_4(struct btrfs_fs_info *fs_info,
+static int __test_case_4(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
@@ -254,54 +326,77 @@ static void __test_case_4(struct btrfs_fs_info *fs_info,
int ret;
em = alloc_extent_map();
- if (!em)
- /* Skip this test on error. */
- return;
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
/* Add [0K, 8K) */
em->start = 0;
em->len = SZ_8K;
em->block_start = 0;
em->block_len = SZ_8K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 8K)");
+ goto out;
+ }
free_extent_map(em);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
- /* Add [8K, 24K) */
+ /* Add [8K, 32K) */
em->start = SZ_8K;
- em->len = 24 * 1024ULL;
+ em->len = 24 * SZ_1K;
em->block_start = SZ_16K; /* avoid merging */
- em->block_len = 24 * 1024ULL;
+ em->block_len = 24 * SZ_1K;
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
- ASSERT(ret == 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [8K, 32K)");
+ goto out;
+ }
free_extent_map(em);
em = alloc_extent_map();
- if (!em)
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
goto out;
+ }
/* Add [0K, 32K) */
em->start = 0;
em->len = SZ_32K;
em->block_start = 0;
em->block_len = SZ_32K;
+ write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
- if (ret)
+ write_unlock(&em_tree->lock);
+ if (ret) {
test_err("case4 [0x%llx 0x%llx): ret %d",
start, len, ret);
- if (em &&
- (start < em->start || start + len > extent_map_end(em)))
+ goto out;
+ }
+ if (em && (start < em->start || start + len > extent_map_end(em))) {
test_err(
"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, len, ret, em->start, em->len, em->block_start,
em->block_len);
+ ret = -EINVAL;
+ }
free_extent_map(em);
out:
- /* free memory */
free_extent_map_tree(em_tree);
+
+ return ret;
}
/*
@@ -329,17 +424,24 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static void test_case_4(struct btrfs_fs_info *fs_info,
+static int test_case_4(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree)
{
- __test_case_4(fs_info, em_tree, 0);
- __test_case_4(fs_info, em_tree, SZ_4K);
+ int ret;
+
+ ret = __test_case_4(fs_info, em_tree, 0);
+ if (ret)
+ return ret;
+ ret = __test_case_4(fs_info, em_tree, SZ_4K);
+
+ return ret;
}
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
+ int ret = 0;
test_msg("running extent_map tests");
@@ -349,25 +451,32 @@ int btrfs_test_extent_map(void)
*/
fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
- if (!em_tree)
- /* Skip the test on error. */
+ if (!em_tree) {
+ ret = -ENOMEM;
goto out;
+ }
extent_map_tree_init(em_tree);
- test_case_1(fs_info, em_tree);
- test_case_2(fs_info, em_tree);
- test_case_3(fs_info, em_tree);
- test_case_4(fs_info, em_tree);
+ ret = test_case_1(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_2(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_3(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_4(fs_info, em_tree);
- kfree(em_tree);
out:
+ kfree(em_tree);
btrfs_free_dummy_fs_info(fs_info);
- return 0;
+ return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 5c2f77e9439b..aebdf23f0cdd 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -8,6 +8,7 @@
#include "../ctree.h"
#include "../disk-io.h"
#include "../free-space-cache.h"
+#include "../block-group.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
@@ -16,7 +17,7 @@
* entry and remove space from either end and the middle, and make sure we can
* remove space that covers adjacent extent entries.
*/
-static int test_extents(struct btrfs_block_group_cache *cache)
+static int test_extents(struct btrfs_block_group *cache)
{
int ret = 0;
@@ -86,8 +87,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
return 0;
}
-static int test_bitmaps(struct btrfs_block_group_cache *cache,
- u32 sectorsize)
+static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
{
u64 next_bitmap_offset;
int ret;
@@ -155,7 +155,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
}
/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
+static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
u32 sectorsize)
{
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
@@ -330,7 +330,7 @@ static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
/* Used by test_steal_space_from_bitmap_to_extent(). */
static int
-check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+check_num_extents_and_bitmaps(const struct btrfs_block_group *cache,
const int num_extents,
const int num_bitmaps)
{
@@ -350,7 +350,7 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
}
/* Used by test_steal_space_from_bitmap_to_extent(). */
-static int check_cache_empty(struct btrfs_block_group_cache *cache)
+static int check_cache_empty(struct btrfs_block_group *cache)
{
u64 offset;
u64 max_extent_size;
@@ -392,7 +392,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* requests.
*/
static int
-test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u32 sectorsize)
{
int ret;
@@ -404,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
};
const struct btrfs_free_space_op *orig_free_space_ops;
- test_msg("running space stealing from bitmap to extent");
+ test_msg("running space stealing from bitmap to extent tests");
/*
* For this test, we want to ensure we end up with an extent entry
@@ -828,15 +828,16 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
test_msg("running btrfs free space cache tests");
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
- if (!fs_info)
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
-
+ }
/*
* For ppc64 (with 64k page size), bytes per bitmap might be
@@ -846,13 +847,14 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
cache = btrfs_alloc_dummy_block_group(fs_info,
BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
if (!cache) {
- test_err("couldn't run the tests");
+ test_std_err(TEST_ALLOC_BLOCK_GROUP);
btrfs_free_dummy_fs_info(fs_info);
return 0;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(root);
goto out;
}
@@ -874,6 +876,5 @@ out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
- test_msg("free space cache tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 89346da890cf..914eea5ba6a7 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -9,6 +9,7 @@
#include "../disk-io.h"
#include "../free-space-tree.h"
#include "../transaction.h"
+#include "../block-group.h"
struct free_space_extent {
u64 start;
@@ -17,7 +18,7 @@ struct free_space_extent {
static int __check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -30,7 +31,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
unsigned int i;
int ret;
- info = search_free_space_info(trans, fs_info, cache, path, 0);
+ info = search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
ret = PTR_ERR(info);
@@ -47,7 +48,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
if (path->slots[0] != 0)
goto invalid;
- end = cache->key.objectid + cache->key.offset;
+ end = cache->start + cache->length;
i = 0;
while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -106,7 +107,7 @@ invalid:
static int check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -115,7 +116,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- info = search_free_space_info(trans, fs_info, cache, path, 0);
+ info = search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
btrfs_release_path(path);
@@ -149,12 +150,12 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
static int test_empty_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset},
+ {cache->start, cache->length},
};
return check_free_space_extents(trans, fs_info, cache, path,
@@ -163,7 +164,7 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans,
static int test_remove_all(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
@@ -171,8 +172,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start,
+ cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -184,18 +185,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
static int test_remove_beginning(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment,
- cache->key.offset - alignment},
+ {cache->start + alignment, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid, alignment);
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -208,19 +208,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
static int test_remove_end(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset - alignment},
+ {cache->start, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid +
- cache->key.offset - alignment,
- alignment);
+ cache->start + cache->length - alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -232,19 +231,18 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
static int test_remove_middle(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment,
- cache->key.offset - 2 * alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, cache->length - 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not remove free space");
@@ -257,24 +255,23 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
static int test_merge_left(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 2 * alignment},
+ {cache->start, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -282,7 +279,7 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -295,25 +292,24 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
static int test_merge_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment, 2 * alignment},
+ {cache->start + alignment, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
+ cache->start + 2 * alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -321,7 +317,7 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -334,24 +330,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
static int test_merge_both(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 3 * alignment},
+ {cache->start, 3 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -359,16 +354,14 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
- alignment);
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -380,26 +373,25 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
static int test_merge_none(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment, alignment},
- {cache->key.objectid + 4 * alignment, alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, alignment},
+ {cache->start + 4 * alignment, alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -407,16 +399,14 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 4 * alignment,
- alignment);
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -428,7 +418,7 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
typedef int (*test_func_t)(struct btrfs_trans_handle *,
struct btrfs_fs_info *,
- struct btrfs_block_group_cache *,
+ struct btrfs_block_group *,
struct btrfs_path *,
u32 alignment);
@@ -437,21 +427,21 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
{
struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_trans_handle trans;
struct btrfs_path *path = NULL;
int ret;
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
ret = -ENOMEM;
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("couldn't allocate dummy root");
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(root);
goto out;
}
@@ -462,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
- test_err("couldn't allocate dummy buffer");
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
@@ -473,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
if (!cache) {
- test_err("couldn't allocate dummy block group cache");
+ test_std_err(TEST_ALLOC_BLOCK_GROUP);
ret = -ENOMEM;
goto out;
}
@@ -486,7 +476,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
path = btrfs_alloc_path();
if (!path) {
- test_err("couldn't allocate path");
+ test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
@@ -539,7 +529,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
if (ret) {
test_err(
- "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
+ "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
@@ -547,7 +537,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
if (ret) {
test_err(
- "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
+ "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index af0c8e30d9e2..09ecf7dc7b08 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -226,31 +226,34 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
u64 offset;
int ret = -ENOMEM;
+ test_msg("running btrfs_get_extent tests");
+
inode = btrfs_new_test_inode();
if (!inode) {
- test_err("couldn't allocate inode");
+ test_std_err(TEST_ALLOC_INODE);
return ret;
}
+ inode->i_mode = S_IFREG;
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("couldn't allocate root");
+ test_std_err(TEST_ALLOC_ROOT);
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_err("couldn't allocate dummy buffer");
+ test_std_err(TEST_ALLOC_ROOT);
goto out;
}
@@ -827,9 +830,11 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
struct extent_map *em = NULL;
int ret = -ENOMEM;
+ test_msg("running hole first btrfs_get_extent test");
+
inode = btrfs_new_test_inode();
if (!inode) {
- test_err("couldn't allocate inode");
+ test_std_err(TEST_ALLOC_INODE);
return ret;
}
@@ -839,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("couldn't allocate root");
+ test_std_err(TEST_ALLOC_ROOT);
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_err("couldn't allocate dummy buffer");
+ test_std_err(TEST_ALLOC_ROOT);
goto out;
}
@@ -927,21 +932,23 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
+ test_msg("running outstanding_extents tests");
+
inode = btrfs_new_test_inode();
if (!inode) {
- test_err("couldn't allocate inode");
+ test_std_err(TEST_ALLOC_INODE);
return ret;
}
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("couldn't allocate root");
+ test_std_err(TEST_ALLOC_ROOT);
goto out;
}
@@ -950,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
- NULL, 0);
+ NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -965,7 +972,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
- 0, NULL, 0);
+ 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -981,8 +988,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DIRTY |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -998,7 +1004,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
- 0, NULL, 0);
+ 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -1016,7 +1022,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
- 0, NULL, 0);
+ 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -1033,7 +1039,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -1049,8 +1055,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1068,7 +1073,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_set_extent_delalloc(inode,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -1082,8 +1087,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1098,8 +1102,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -1110,17 +1113,16 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
{
int ret;
+ test_msg("running inode tests");
+
set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
- test_msg("running btrfs_get_extent tests");
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("running hole first btrfs_get_extent test");
ret = test_hole_first(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("running outstanding_extents tests");
return test_extent_accounting(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 412b910b04cc..ac035a6fa003 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -32,7 +32,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_err("couldn't allocate path");
+ test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
@@ -82,7 +82,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
path = btrfs_alloc_path();
if (!path) {
- test_err("couldn't allocate path");
+ test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
@@ -132,7 +132,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_err("couldn't allocate path");
+ test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
path->leave_spinning = 1;
@@ -166,7 +166,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_err("couldn't allocate path");
+ test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
@@ -215,7 +215,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("qgroup basic add");
+ test_msg("running qgroup add/remove tests");
ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID);
if (ret) {
test_err("couldn't create a qgroup %d", ret);
@@ -316,7 +316,7 @@ static int test_multiple_refs(struct btrfs_root *root,
btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("qgroup multiple refs test");
+ test_msg("running qgroup multiple refs test");
/*
* We have BTRFS_FS_TREE_OBJECTID created already from the
@@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_err("couldn't allocate dummy fs info");
+ test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_err("couldn't allocate root");
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(root);
goto out;
}
@@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
* *cough*backref walking code*cough*
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_err("couldn't allocate dummy buffer");
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
@@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_err("couldn't allocate a fs root");
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(tmp_root);
goto out;
}
@@ -510,7 +510,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_err("couldn't allocate a fs root");
+ test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(tmp_root);
goto out;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e4e665f422fc..cfc08ef9b876 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -19,24 +20,98 @@
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
+#include "block-group.h"
#define BTRFS_ROOT_TRANS_TAG 0
+/*
+ * Transaction states and transitions
+ *
+ * No running transaction (fs tree blocks are not modified)
+ * |
+ * | To next stage:
+ * | Call start_transaction() variants. Except btrfs_join_transaction_nostart().
+ * V
+ * Transaction N [[TRANS_STATE_RUNNING]]
+ * |
+ * | New trans handles can be attached to transaction N by calling all
+ * | start_transaction() variants.
+ * |
+ * | To next stage:
+ * | Call btrfs_commit_transaction() on any trans handle attached to
+ * | transaction N
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Will wait for previous running transaction to completely finish if there
+ * | is one
+ * |
+ * | Then one of the following happes:
+ * | - Wait for all other trans handle holders to release.
+ * | The btrfs_commit_transaction() caller will do the commit work.
+ * | - Wait for current transaction to be committed by others.
+ * | Other btrfs_commit_transaction() caller will do the commit work.
+ * |
+ * | At this stage, only btrfs_join_transaction*() variants can attach
+ * | to this running transaction.
+ * | All other variants will wait for current one to finish and attach to
+ * | transaction N+1.
+ * |
+ * | To next stage:
+ * | Caller is chosen to commit transaction N, and all other trans handle
+ * | haven been released.
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_DOING]]
+ * |
+ * | The heavy lifting transaction work is started.
+ * | From running delayed refs (modifying extent tree) to creating pending
+ * | snapshots, running qgroups.
+ * | In short, modify supporting trees to reflect modifications of subvolume
+ * | trees.
+ * |
+ * | At this stage, all start_transaction() calls will wait for this
+ * | transaction to finish and attach to transaction N+1.
+ * |
+ * | To next stage:
+ * | Until all supporting trees are updated.
+ * V
+ * Transaction N [[TRANS_STATE_UNBLOCKED]]
+ * | Transaction N+1
+ * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]]
+ * | need to write them back to disk and update |
+ * | super blocks. |
+ * | |
+ * | At this stage, new transaction is allowed to |
+ * | start. |
+ * | All new start_transaction() calls will be |
+ * | attached to transid N+1. |
+ * | |
+ * | To next stage: |
+ * | Until all tree blocks are super blocks are |
+ * | written to block devices |
+ * V |
+ * Transaction N [[TRANS_STATE_COMPLETED]] V
+ * All tree blocks and super blocks are written. Transaction N+1
+ * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]]
+ * data structures will be cleaned up. | Life goes on
+ */
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = __TRANS_START,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
- __TRANS_JOIN),
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOSTART),
[TRANS_STATE_UNBLOCKED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
- __TRANS_JOIN_NOLOCK),
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
[TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
- __TRANS_JOIN_NOLOCK),
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
};
void btrfs_put_transaction(struct btrfs_transaction *transaction)
@@ -50,14 +125,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
transaction->delayed_refs.pending_csums);
- while (!list_empty(&transaction->pending_chunks)) {
- struct extent_map *em;
-
- em = list_first_entry(&transaction->pending_chunks,
- struct extent_map, list);
- list_del_init(&em->list);
- free_extent_map(em);
- }
/*
* If any block groups are found in ->deleted_bgs then it's
* because the transaction was aborted and a commit did not
@@ -66,48 +133,20 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
* discard the physical locations of the block groups.
*/
while (!list_empty(&transaction->deleted_bgs)) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = list_first_entry(&transaction->deleted_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&cache->bg_list);
btrfs_put_block_group_trimming(cache);
btrfs_put_block_group(cache);
}
+ WARN_ON(!list_empty(&transaction->dev_update_list));
kfree(transaction);
}
}
-static void clear_btree_io_tree(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once clear_btree_io_tree is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
static noinline void switch_commit_roots(struct btrfs_transaction *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -121,7 +160,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
root->commit_root = btrfs_root_node(root);
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
- clear_btree_io_tree(&root->dirty_log_pages);
+ extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -165,6 +204,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}
/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
+/*
* either allocate a new transaction or hop into the existing one
*/
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
@@ -263,19 +320,18 @@ loop:
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->dev_update_list);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
INIT_LIST_HEAD(&cur_trans->dropped_roots);
mutex_init(&cur_trans->cache_write_mutex);
- cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
- extent_io_tree_init(&cur_trans->dirty_pages,
- fs_info->btree_inode);
+ extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -397,7 +453,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
- return (trans->state >= TRANS_STATE_BLOCKED &&
+ return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!trans->aborted);
}
@@ -500,7 +556,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* worth of delayed refs updates in this trans handle, and
* refill that amount for whatever is missing in the reserve.
*/
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (delayed_refs_rsv->full == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
@@ -562,7 +618,8 @@ again:
ret = join_transaction(fs_info, type);
if (ret == -EBUSY) {
wait_current_trans(fs_info);
- if (unlikely(type == TRANS_ATTACH))
+ if (unlikely(type == TRANS_ATTACH ||
+ type == TRANS_JOIN_NOSTART))
ret = -ENOENT;
}
} while (ret == -EBUSY);
@@ -583,7 +640,7 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
@@ -650,7 +707,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
if (IS_ERR(trans))
return trans;
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
num_bytes, min_factor);
if (ret) {
@@ -672,13 +729,23 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
true);
}
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
BTRFS_RESERVE_NO_FLUSH, true);
}
/*
+ * Similar to regular join but it never starts a transaction when none is
+ * running or after waiting for the current one to finish.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN_NOSTART,
+ BTRFS_RESERVE_NO_FLUSH, true);
+}
+
+/*
* btrfs_attach_transaction() - catch the running transaction
*
* It is used when we want to commit the current the transaction, but
@@ -801,7 +868,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
cur_trans->delayed_refs.flushing)
return 1;
@@ -834,7 +901,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
if (refcount_read(&trans->use_count) > 1) {
@@ -850,13 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle)
- return btrfs_commit_transaction(trans);
- else
- wake_up_process(info->transaction_kthread);
- }
-
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(info->sb);
@@ -928,7 +987,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
* superblock that points to btree nodes/leafs for which
* writeback hasn't finished yet (and without errors).
* We cleanup any entries left in the io tree when committing
- * the transaction (through clear_btree_io_tree()).
+ * the transaction (through extent_io_tree_release()).
*/
if (err == -ENOMEM) {
err = 0;
@@ -973,7 +1032,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* left in the io tree. For a log commit, we don't remove them
* after committing the log because the tree can be accessed
* concurrently - we do it only at transaction commit time when
- * it's safe to do it (through clear_btree_io_tree()).
+ * it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT, 0, 0, &cached_state);
@@ -993,7 +1052,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
return werr;
}
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
@@ -1051,7 +1110,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
- clear_btree_io_tree(&trans->transaction->dirty_pages);
+ extent_io_tree_release(&trans->transaction->dirty_pages);
if (ret)
return ret;
@@ -1130,17 +1189,17 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- ret = btrfs_run_dev_stats(trans, fs_info);
+ ret = btrfs_run_dev_stats(trans);
if (ret)
return ret;
- ret = btrfs_run_dev_replace(trans, fs_info);
+ ret = btrfs_run_dev_replace(trans);
if (ret)
return ret;
ret = btrfs_run_qgroups(trans);
if (ret)
return ret;
- ret = btrfs_setup_space_cache(trans, fs_info);
+ ret = btrfs_setup_space_cache(trans);
if (ret)
return ret;
@@ -1168,7 +1227,7 @@ again:
}
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
- ret = btrfs_write_dirty_block_groups(trans, fs_info);
+ ret = btrfs_write_dirty_block_groups(trans);
if (ret)
return ret;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
@@ -1878,7 +1937,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1952,6 +2011,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ASSERT(refcount_read(&trans->use_count) == 1);
+
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
@@ -2056,6 +2117,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
} else {
spin_unlock(&fs_info->trans_lock);
+ /*
+ * The previous transaction was aborted and was already removed
+ * from the list of transactions at fs_info->trans_list. So we
+ * abort to prevent writing a new superblock that reflects a
+ * corrupt state (pointing to trees with unwritten nodes/leafs).
+ */
+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ ret = -EROFS;
+ goto cleanup_transaction;
+ }
}
extwriter_counter_dec(cur_trans, trans->type);
@@ -2241,8 +2312,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_copy));
- btrfs_update_commit_device_size(fs_info);
- btrfs_update_commit_device_bytes_used(cur_trans);
+ btrfs_commit_device_sizes(cur_trans);
clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f1ba78949d1b..49f7196368f5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -13,7 +13,6 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
- TRANS_STATE_BLOCKED,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
@@ -51,7 +50,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
- struct list_head pending_chunks;
+ struct list_head dev_update_list;
struct list_head switch_commits;
struct list_head dirty_bgs;
@@ -80,7 +79,6 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
- unsigned int num_dirty_bgs;
/* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
spinlock_t dropped_roots_lock;
@@ -95,11 +93,13 @@ struct btrfs_transaction {
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
#define __TRANS_DUMMY (1U << 13)
+#define __TRANS_JOIN_NOSTART (1U << 14)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+#define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART)
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
@@ -120,7 +120,6 @@ struct btrfs_trans_handle {
bool allocating_chunk;
bool can_flush_pending_bgs;
bool reloc_reserved;
- bool sync;
bool dirty;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
@@ -184,7 +183,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
unsigned int num_items,
int min_factor);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
@@ -217,8 +217,6 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
@@ -226,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a62e1e837a89..97f3520b8d98 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -15,11 +15,15 @@
* carefully reviewed otherwise so it does not prevent mount of valid images.
*/
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "tree-checker.h"
#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
+#include "misc.h"
/*
* Error message should follow the following format:
@@ -41,12 +45,12 @@
* Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
* Allows callers to customize the output.
*/
-__printf(4, 5)
+__printf(3, 4)
__cold
-static void generic_err(const struct btrfs_fs_info *fs_info,
- const struct extent_buffer *eb, int slot,
+static void generic_err(const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
struct va_format vaf;
va_list args;
@@ -66,12 +70,12 @@ static void generic_err(const struct btrfs_fs_info *fs_info,
* Customized reporter for extent data item, since its key objectid and
* offset has its own meaning.
*/
-__printf(4, 5)
+__printf(3, 4)
__cold
-static void file_extent_err(const struct btrfs_fs_info *fs_info,
- const struct extent_buffer *eb, int slot,
+static void file_extent_err(const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_key key;
struct va_format vaf;
va_list args;
@@ -94,38 +98,145 @@ static void file_extent_err(const struct btrfs_fs_info *fs_info,
* Return 0 if the btrfs_file_extent_##name is aligned to @alignment
* Else return 1
*/
-#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \
+#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \
({ \
if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
- file_extent_err((fs_info), (leaf), (slot), \
+ file_extent_err((leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
(alignment)); \
(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
})
-static int check_extent_data_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+static u64 file_extent_end(struct extent_buffer *leaf,
+ struct btrfs_key *key,
+ struct btrfs_file_extent_item *extent)
+{
+ u64 end;
+ u64 len;
+
+ if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
+ end = ALIGN(key->offset + len, leaf->fs_info->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ end = key->offset + len;
+ }
+ return end;
+}
+
+/*
+ * Customized report for dir_item, the only new important information is
+ * key->objectid, which represents inode number
+ */
+__printf(3, 4)
+__cold
+static void dir_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+/*
+ * This functions checks prev_key->objectid, to ensure current key and prev_key
+ * share the same objectid as inode number.
+ *
+ * This is to detect missing INODE_ITEM in subvolume trees.
+ *
+ * Return true if everything is OK or we don't need to check.
+ * Return false if anything is wrong.
+ */
+static bool check_prev_ino(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ /* No prev key, skip check */
+ if (slot == 0)
+ return true;
+
+ /* Only these key->types needs to be checked */
+ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+ key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_DIR_INDEX_KEY ||
+ key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_EXTENT_DATA_KEY);
+
+ /*
+ * Only subvolume trees along with their reloc trees need this check.
+ * Things like log tree doesn't follow this ino requirement.
+ */
+ if (!is_fstree(btrfs_header_owner(leaf)))
+ return true;
+
+ if (key->objectid == prev_key->objectid)
+ return true;
+
+ /* Error found */
+ dir_item_err(leaf, slot,
+ "invalid previous key objectid, have %llu expect %llu",
+ prev_key->objectid, key->objectid);
+ return false;
+}
+static int check_extent_data_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_file_extent_item *fi;
u32 sectorsize = fs_info->sectorsize;
u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u64 extent_end;
if (!IS_ALIGNED(key->offset, sectorsize)) {
- file_extent_err(fs_info, leaf, slot,
+ file_extent_err(leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
+ /*
+ * Previous key must have the same key->objectid (ino).
+ * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA.
+ * But if objectids mismatch, it means we have a missing
+ * INODE_ITEM.
+ */
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
- file_extent_err(fs_info, leaf, slot,
+ /*
+ * Make sure the item contains at least inline header, so the file
+ * extent type is not some garbage.
+ */
+ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ file_extent_err(leaf, slot,
+ "invalid item size, have %u expect [%zu, %u)",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
+ SZ_4K);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
+ file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
- BTRFS_FILE_EXTENT_TYPES);
+ BTRFS_NR_FILE_EXTENT_TYPES - 1);
return -EUCLEAN;
}
@@ -133,15 +244,15 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
- file_extent_err(fs_info, leaf, slot,
+ if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
+ file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
- BTRFS_COMPRESS_TYPES);
+ BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
if (btrfs_file_extent_encryption(leaf, fi)) {
- file_extent_err(fs_info, leaf, slot,
+ file_extent_err(leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
return -EUCLEAN;
@@ -149,7 +260,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
if (key->offset) {
- file_extent_err(fs_info, leaf, slot,
+ file_extent_err(leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
return -EUCLEAN;
@@ -163,7 +274,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
/* Uncompressed inline extent size must match item size */
if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi)) {
- file_extent_err(fs_info, leaf, slot,
+ file_extent_err(leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi));
@@ -174,84 +285,105 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info,
/* Regular or preallocated extent has fixed item size */
if (item_size != sizeof(*fi)) {
- file_extent_err(fs_info, leaf, slot,
+ file_extent_err(leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize))
+ if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
+ return -EUCLEAN;
+
+ /* Catch extent end overflow */
+ if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+ key->offset, &extent_end)) {
+ file_extent_err(leaf, slot,
+ "extent end overflow, have file offset %llu extent num bytes %llu",
+ key->offset,
+ btrfs_file_extent_num_bytes(leaf, fi));
return -EUCLEAN;
+ }
+
+ /*
+ * Check that no two consecutive file extent items, in the same leaf,
+ * present ranges that overlap each other.
+ */
+ if (slot > 0 &&
+ prev_key->objectid == key->objectid &&
+ prev_key->type == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *prev_fi;
+ u64 prev_end;
+
+ prev_fi = btrfs_item_ptr(leaf, slot - 1,
+ struct btrfs_file_extent_item);
+ prev_end = file_extent_end(leaf, prev_key, prev_fi);
+ if (prev_end > key->offset) {
+ file_extent_err(leaf, slot - 1,
+"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
+ prev_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
-static int check_csum_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf, struct btrfs_key *key,
- int slot)
+static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot, struct btrfs_key *prev_key)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
if (!IS_ALIGNED(key->offset, sectorsize)) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
btrfs_item_size_nr(leaf, slot), csumsize);
return -EUCLEAN;
}
+ if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
+ u64 prev_csum_end;
+ u32 prev_item_size;
+
+ prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_csum_end = (prev_item_size / csumsize) * sectorsize;
+ prev_csum_end += prev_key->offset;
+ if (prev_csum_end > key->offset) {
+ generic_err(leaf, slot - 1,
+"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
+ prev_csum_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
return 0;
}
-/*
- * Customized reported for dir_item, only important new info is key->objectid,
- * which represents inode number
- */
-__printf(4, 5)
-__cold
-static void dir_item_err(const struct btrfs_fs_info *fs_info,
- const struct extent_buffer *eb, int slot,
- const char *fmt, ...)
-{
- struct btrfs_key key;
- struct va_format vaf;
- va_list args;
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- btrfs_crit(fs_info,
- "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node",
- btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
- key.objectid, &vaf);
- va_end(args);
-}
-
-static int check_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+static int check_dir_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 cur = 0;
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
u32 name_len;
@@ -263,7 +395,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
return -EUCLEAN;
@@ -272,7 +404,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
@@ -280,14 +412,14 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
if (key->type == BTRFS_XATTR_ITEM_KEY &&
dir_type != BTRFS_FT_XATTR) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
if (dir_type == BTRFS_FT_XATTR &&
key->type != BTRFS_XATTR_ITEM_KEY) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
}
@@ -300,13 +432,13 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
if (name_len > max_name_len) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
BTRFS_MAX_XATTR_SIZE(fs_info));
@@ -314,7 +446,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
}
if (data_len && dir_type != BTRFS_FT_XATTR) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
return -EUCLEAN;
@@ -324,7 +456,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
/* header and name/data should not cross item boundary */
if (cur + total_size > item_size) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
return -EUCLEAN;
@@ -342,7 +474,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
if (key->offset != name_hash) {
- dir_item_err(fs_info, leaf, slot,
+ dir_item_err(leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
return -EUCLEAN;
@@ -354,12 +486,12 @@ static int check_dir_item(struct btrfs_fs_info *fs_info,
return 0;
}
-__printf(4, 5)
+__printf(3, 4)
__cold
-static void block_group_err(const struct btrfs_fs_info *fs_info,
- const struct extent_buffer *eb, int slot,
+static void block_group_err(const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_key key;
struct va_format vaf;
va_list args;
@@ -378,8 +510,7 @@ static void block_group_err(const struct btrfs_fs_info *fs_info,
va_end(args);
}
-static int check_block_group_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
+static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_block_group_item bgi;
@@ -392,13 +523,13 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
* handle it. We care more about the size.
*/
if (key->offset == 0) {
- block_group_err(fs_info, leaf, slot,
+ block_group_err(leaf, slot,
"invalid block group size 0");
return -EUCLEAN;
}
if (item_size != sizeof(bgi)) {
- block_group_err(fs_info, leaf, slot,
+ block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
item_size, sizeof(bgi));
return -EUCLEAN;
@@ -406,25 +537,25 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_block_group_chunk_objectid(&bgi) !=
+ if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
- block_group_err(fs_info, leaf, slot,
+ block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
- btrfs_block_group_chunk_objectid(&bgi),
+ btrfs_stack_block_group_chunk_objectid(&bgi),
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
return -EUCLEAN;
}
- if (btrfs_block_group_used(&bgi) > key->offset) {
- block_group_err(fs_info, leaf, slot,
+ if (btrfs_stack_block_group_used(&bgi) > key->offset) {
+ block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
- btrfs_block_group_used(&bgi), key->offset);
+ btrfs_stack_block_group_used(&bgi), key->offset);
return -EUCLEAN;
}
- flags = btrfs_block_group_flags(&bgi);
+ flags = btrfs_stack_block_group_flags(&bgi);
if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
- block_group_err(fs_info, leaf, slot,
+ block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
@@ -437,7 +568,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
type != BTRFS_BLOCK_GROUP_SYSTEM &&
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)) {
- block_group_err(fs_info, leaf, slot,
+ block_group_err(leaf, slot,
"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
@@ -448,37 +579,841 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
return 0;
}
+__printf(4, 5)
+__cold
+static void chunk_err(const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = leaf->fs_info;
+ bool is_sb;
+ struct va_format vaf;
+ va_list args;
+ int i;
+ int slot = -1;
+
+ /* Only superblock eb is able to have such small offset */
+ is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
+
+ if (!is_sb) {
+ /*
+ * Get the slot number by iterating through all slots, this
+ * would provide better readability.
+ */
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ if (btrfs_item_ptr_offset(leaf, i) ==
+ (unsigned long)chunk) {
+ slot = i;
+ break;
+ }
+ }
+ }
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (is_sb)
+ btrfs_crit(fs_info,
+ "corrupt superblock syschunk array: chunk_start=%llu, %pV",
+ logical, &vaf);
+ else
+ btrfs_crit(fs_info,
+ "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV",
+ BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot,
+ logical, &vaf);
+ va_end(args);
+}
+
+/*
+ * The common chunk check which could also work on super block sys chunk array.
+ *
+ * Return -EUCLEAN if anything is corrupted.
+ * Return 0 if everything is OK.
+ */
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ u64 length;
+ u64 stripe_len;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
+ u64 features;
+ bool mixed = false;
+
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+
+ if (!num_stripes) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes, have %u", num_stripes);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk logical, have %llu should aligned to %u",
+ logical, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk sectorsize, have %u expect %u",
+ btrfs_chunk_sector_size(leaf, chunk),
+ fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk length, have %llu", length);
+ return -EUCLEAN;
+ }
+ if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EUCLEAN;
+ }
+ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ type) {
+ chunk_err(leaf, chunk, logical,
+ "unrecognized chunk type: 0x%llx",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk));
+ return -EUCLEAN;
+ }
+
+ if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EUCLEAN;
+ }
+ if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ chunk_err(leaf, chunk, logical,
+ "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
+ type, BTRFS_BLOCK_GROUP_TYPE_MASK);
+ return -EUCLEAN;
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ chunk_err(leaf, chunk, logical,
+ "system chunk with data or metadata type: 0x%llx",
+ type);
+ return -EUCLEAN;
+ }
+
+ features = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = true;
+
+ if (!mixed) {
+ if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA)) {
+ chunk_err(leaf, chunk, logical,
+ "mixed chunk type in non-mixed mode: 0x%llx", type);
+ return -EUCLEAN;
+ }
+ }
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+__printf(3, 4)
+__cold
+static void dev_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(eb->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dev_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dev_item *ditem;
+
+ if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+ dev_item_err(leaf, slot,
+ "invalid objectid: has=%llu expect=%llu",
+ key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
+ return -EUCLEAN;
+ }
+ ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
+ if (btrfs_device_id(leaf, ditem) != key->offset) {
+ dev_item_err(leaf, slot,
+ "devid mismatch: key has=%llu item has=%llu",
+ key->offset, btrfs_device_id(leaf, ditem));
+ return -EUCLEAN;
+ }
+
+ /*
+ * For device total_bytes, we don't have reliable way to check it, as
+ * it can be 0 for device removal. Device size check can only be done
+ * by dev extents check.
+ */
+ if (btrfs_device_bytes_used(leaf, ditem) >
+ btrfs_device_total_bytes(leaf, ditem)) {
+ dev_item_err(leaf, slot,
+ "invalid bytes used: have %llu expect [0, %llu]",
+ btrfs_device_bytes_used(leaf, ditem),
+ btrfs_device_total_bytes(leaf, ditem));
+ return -EUCLEAN;
+ }
+ /*
+ * Remaining members like io_align/type/gen/dev_group aren't really
+ * utilized. Skip them to make later usage of them easier.
+ */
+ return 0;
+}
+
+/* Inode item error output has the same format as dir_item_err() */
+#define inode_item_err(fs_info, eb, slot, fmt, ...) \
+ dir_item_err(eb, slot, fmt, __VA_ARGS__)
+
+static int check_inode_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_inode_item *iitem;
+ u64 super_gen = btrfs_super_generation(fs_info->super_copy);
+ u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ u32 mode;
+
+ if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ generic_err(leaf, slot,
+ "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ return -EUCLEAN;
+ }
+ if (key->offset != 0) {
+ inode_item_err(fs_info, leaf, slot,
+ "invalid key offset: has %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+ iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
+
+ /* Here we use super block generation + 1 to handle log tree */
+ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
+ inode_item_err(fs_info, leaf, slot,
+ "invalid inode generation: has %llu expect (0, %llu]",
+ btrfs_inode_generation(leaf, iitem),
+ super_gen + 1);
+ return -EUCLEAN;
+ }
+ /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
+ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+ inode_item_err(fs_info, leaf, slot,
+ "invalid inode generation: has %llu expect [0, %llu]",
+ btrfs_inode_transid(leaf, iitem), super_gen + 1);
+ return -EUCLEAN;
+ }
+
+ /*
+ * For size and nbytes it's better not to be too strict, as for dir
+ * item its size/nbytes can easily get wrong, but doesn't affect
+ * anything in the fs. So here we skip the check.
+ */
+ mode = btrfs_inode_mode(leaf, iitem);
+ if (mode & ~valid_mask) {
+ inode_item_err(fs_info, leaf, slot,
+ "unknown mode bit detected: 0x%x",
+ mode & ~valid_mask);
+ return -EUCLEAN;
+ }
+
+ /*
+ * S_IFMT is not bit mapped so we can't completely rely on
+ * is_power_of_2/has_single_bit_set, but it can save us from checking
+ * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
+ */
+ if (!has_single_bit_set(mode & S_IFMT)) {
+ if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
+ inode_item_err(fs_info, leaf, slot,
+ "invalid mode: has 0%o expect valid S_IF* bit(s)",
+ mode & S_IFMT);
+ return -EUCLEAN;
+ }
+ }
+ if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
+ inode_item_err(fs_info, leaf, slot,
+ "invalid nlink: has %u expect no more than 1 for dir",
+ btrfs_inode_nlink(leaf, iitem));
+ return -EUCLEAN;
+ }
+ if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
+ inode_item_err(fs_info, leaf, slot,
+ "unknown flags detected: 0x%llx",
+ btrfs_inode_flags(leaf, iitem) &
+ ~BTRFS_INODE_FLAG_MASK);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_root_item ri;
+ const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
+ BTRFS_ROOT_SUBVOL_DEAD;
+
+ /* No such tree id */
+ if (key->objectid == 0) {
+ generic_err(leaf, slot, "invalid root id 0");
+ return -EUCLEAN;
+ }
+
+ /*
+ * Some older kernel may create ROOT_ITEM with non-zero offset, so here
+ * we only check offset for reloc tree whose key->offset must be a
+ * valid tree.
+ */
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ generic_err(leaf, slot, "invalid root id 0 for reloc tree");
+ return -EUCLEAN;
+ }
+
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ generic_err(leaf, slot,
+ "invalid root item size, have %u expect %zu",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ }
+
+ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(ri));
+
+ /* Generation related */
+ if (btrfs_root_generation(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1) {
+ generic_err(leaf, slot,
+ "invalid root generation, have %llu expect (0, %llu]",
+ btrfs_root_generation(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (btrfs_root_generation_v2(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1) {
+ generic_err(leaf, slot,
+ "invalid root v2 generation, have %llu expect (0, %llu]",
+ btrfs_root_generation_v2(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (btrfs_root_last_snapshot(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1) {
+ generic_err(leaf, slot,
+ "invalid root last_snapshot, have %llu expect (0, %llu]",
+ btrfs_root_last_snapshot(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+
+ /* Alignment and level check */
+ if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) {
+ generic_err(leaf, slot,
+ "invalid root bytenr, have %llu expect to be aligned to %u",
+ btrfs_root_bytenr(&ri), fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) {
+ generic_err(leaf, slot,
+ "invalid root level, have %u expect [0, %u]",
+ btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ if (ri.drop_level >= BTRFS_MAX_LEVEL) {
+ generic_err(leaf, slot,
+ "invalid root level, have %u expect [0, %u]",
+ ri.drop_level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+
+ /* Flags check */
+ if (btrfs_root_flags(&ri) & ~valid_root_flags) {
+ generic_err(leaf, slot,
+ "invalid root flags, have 0x%llx expect mask 0x%llx",
+ btrfs_root_flags(&ri), valid_root_flags);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+__printf(3,4)
+__cold
+static void extent_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+ u64 bytenr;
+ u64 len;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ bytenr = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_TREE_BLOCK_REF_KEY ||
+ key.type == BTRFS_SHARED_BLOCK_REF_KEY)
+ len = eb->fs_info->nodesize;
+ else
+ len = key.offset;
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(eb->fs_info,
+ "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ eb->start, slot, bytenr, len, &vaf);
+ va_end(args);
+}
+
+static int check_extent_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_extent_item *ei;
+ bool is_tree_block = false;
+ unsigned long ptr; /* Current pointer inside inline refs */
+ unsigned long end; /* Extent item end */
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u64 flags;
+ u64 generation;
+ u64 total_refs; /* Total refs in btrfs_extent_item */
+ u64 inline_refs = 0; /* found total inline refs */
+
+ if (key->type == BTRFS_METADATA_ITEM_KEY &&
+ !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ generic_err(leaf, slot,
+"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
+ return -EUCLEAN;
+ }
+ /* key->objectid is the bytenr for both key types */
+ if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) {
+ generic_err(leaf, slot,
+ "invalid key objectid, have %llu expect to be aligned to %u",
+ key->objectid, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ /* key->offset is tree level for METADATA_ITEM_KEY */
+ if (key->type == BTRFS_METADATA_ITEM_KEY &&
+ key->offset >= BTRFS_MAX_LEVEL) {
+ extent_err(leaf, slot,
+ "invalid tree level, have %llu expect [0, %u]",
+ key->offset, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+
+ /*
+ * EXTENT/METADATA_ITEM consists of:
+ * 1) One btrfs_extent_item
+ * Records the total refs, type and generation of the extent.
+ *
+ * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only)
+ * Records the first key and level of the tree block.
+ *
+ * 2) Zero or more btrfs_extent_inline_ref(s)
+ * Each inline ref has one btrfs_extent_inline_ref shows:
+ * 2.1) The ref type, one of the 4
+ * TREE_BLOCK_REF Tree block only
+ * SHARED_BLOCK_REF Tree block only
+ * EXTENT_DATA_REF Data only
+ * SHARED_DATA_REF Data only
+ * 2.2) Ref type specific data
+ * Either using btrfs_extent_inline_ref::offset, or specific
+ * data structure.
+ */
+ if (item_size < sizeof(*ei)) {
+ extent_err(leaf, slot,
+ "invalid item size, have %u expect [%zu, %u)",
+ item_size, sizeof(*ei),
+ BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+ end = item_size + btrfs_item_ptr_offset(leaf, slot);
+
+ /* Checks against extent_item */
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+ total_refs = btrfs_extent_refs(leaf, ei);
+ generation = btrfs_extent_generation(leaf, ei);
+ if (generation > btrfs_super_generation(fs_info->super_copy) + 1) {
+ extent_err(leaf, slot,
+ "invalid generation, have %llu expect (0, %llu]",
+ generation,
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ extent_err(leaf, slot,
+ "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
+ flags, BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ return -EUCLEAN;
+ }
+ is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ if (is_tree_block) {
+ if (key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->offset != fs_info->nodesize) {
+ extent_err(leaf, slot,
+ "invalid extent length, have %llu expect %u",
+ key->offset, fs_info->nodesize);
+ return -EUCLEAN;
+ }
+ } else {
+ if (key->type != BTRFS_EXTENT_ITEM_KEY) {
+ extent_err(leaf, slot,
+ "invalid key type, have %u expect %u for data backref",
+ key->type, BTRFS_EXTENT_ITEM_KEY);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid extent length, have %llu expect aligned to %u",
+ key->offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ }
+ ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
+
+ /* Check the special case of btrfs_tree_block_info */
+ if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) {
+ extent_err(leaf, slot,
+ "invalid tree block info level, have %u expect [0, %u]",
+ btrfs_tree_block_level(leaf, info),
+ BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1);
+ }
+
+ /* Check inline refs */
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ u64 dref_offset;
+ u64 inline_offset;
+ u8 inline_type;
+
+ if (ptr + sizeof(*iref) > end) {
+ extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
+ ptr, sizeof(*iref), end);
+ return -EUCLEAN;
+ }
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ inline_type = btrfs_extent_inline_ref_type(leaf, iref);
+ inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
+ extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
+ ptr, inline_type, end);
+ return -EUCLEAN;
+ }
+
+ switch (inline_type) {
+ /* inline_offset is subvolid of the owner, no need to check */
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ inline_refs++;
+ break;
+ /* Contains parent bytenr */
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid tree parent bytenr, have %llu expect aligned to %u",
+ inline_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs++;
+ break;
+ /*
+ * Contains owner subvolid, owner key objectid, adjusted offset.
+ * The only obvious corruption can happen in that offset.
+ */
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+ if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid data ref offset, have %llu expect aligned to %u",
+ dref_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs += btrfs_extent_data_ref_count(leaf, dref);
+ break;
+ /* Contains parent bytenr and ref count */
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid data parent bytenr, have %llu expect aligned to %u",
+ inline_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs += btrfs_shared_data_ref_count(leaf, sref);
+ break;
+ default:
+ extent_err(leaf, slot, "unknown inline ref type: %u",
+ inline_type);
+ return -EUCLEAN;
+ }
+ ptr += btrfs_extent_inline_ref_size(inline_type);
+ }
+ /* No padding is allowed */
+ if (ptr != end) {
+ extent_err(leaf, slot,
+ "invalid extent item size, padding bytes found");
+ return -EUCLEAN;
+ }
+
+ /* Finally, check the inline refs against total refs */
+ if (inline_refs > total_refs) {
+ extent_err(leaf, slot,
+ "invalid extent refs, have %llu expect >= inline %llu",
+ total_refs, inline_refs);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_simple_keyed_refs(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ u32 expect_item_size = 0;
+
+ if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+ expect_item_size = sizeof(struct btrfs_shared_data_ref);
+
+ if (btrfs_item_size_nr(leaf, slot) != expect_item_size) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect %u for key type %u",
+ btrfs_item_size_nr(leaf, slot),
+ expect_item_size, key->type);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ generic_err(leaf, slot,
+"invalid key objectid for shared block ref, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+ !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid tree parent bytenr, have %llu expect aligned to %u",
+ key->offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_extent_data_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_extent_data_ref *dref;
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+
+ if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect aligned to %zu for key type %u",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(*dref), key->type);
+ }
+ if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ generic_err(leaf, slot,
+"invalid key objectid for shared block ref, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ for (; ptr < end; ptr += sizeof(*dref)) {
+ u64 root_objectid;
+ u64 owner;
+ u64 offset;
+ u64 hash;
+
+ dref = (struct btrfs_extent_data_ref *)ptr;
+ root_objectid = btrfs_extent_data_ref_root(leaf, dref);
+ owner = btrfs_extent_data_ref_objectid(leaf, dref);
+ offset = btrfs_extent_data_ref_offset(leaf, dref);
+ hash = hash_extent_data_ref(root_objectid, owner, offset);
+ if (hash != key->offset) {
+ extent_err(leaf, slot,
+ "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
+ hash, key->offset);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
+ extent_err(leaf, slot,
+ "invalid extent data backref offset, have %llu expect aligned to %u",
+ offset, leaf->fs_info->sectorsize);
+ }
+ }
+ return 0;
+}
+
+#define inode_ref_err(fs_info, eb, slot, fmt, args...) \
+ inode_item_err(fs_info, eb, slot, fmt, ##args)
+static int check_inode_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ struct btrfs_inode_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+ /* namelen can't be 0, so item_size == sizeof() is also invalid */
+ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ inode_ref_err(fs_info, leaf, slot,
+ "invalid item size, have %u expect (%zu, %u)",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ end = ptr + btrfs_item_size_nr(leaf, slot);
+ while (ptr < end) {
+ u16 namelen;
+
+ if (ptr + sizeof(iref) > end) {
+ inode_ref_err(fs_info, leaf, slot,
+ "inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
+ ptr, end, sizeof(iref));
+ return -EUCLEAN;
+ }
+
+ iref = (struct btrfs_inode_ref *)ptr;
+ namelen = btrfs_inode_ref_name_len(leaf, iref);
+ if (ptr + sizeof(*iref) + namelen > end) {
+ inode_ref_err(fs_info, leaf, slot,
+ "inode ref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+
+ /*
+ * NOTE: In theory we should record all found index numbers
+ * to find any duplicated indexes, but that will be too time
+ * consuming for inodes with too many hard links.
+ */
+ ptr += sizeof(*iref) + namelen;
+ }
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
-static int check_leaf_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+static int check_leaf_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
int ret = 0;
+ struct btrfs_chunk *chunk;
switch (key->type) {
case BTRFS_EXTENT_DATA_KEY:
- ret = check_extent_data_item(fs_info, leaf, key, slot);
+ ret = check_extent_data_item(leaf, key, slot, prev_key);
break;
case BTRFS_EXTENT_CSUM_KEY:
- ret = check_csum_item(fs_info, leaf, key, slot);
+ ret = check_csum_item(leaf, key, slot, prev_key);
break;
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
case BTRFS_XATTR_ITEM_KEY:
- ret = check_dir_item(fs_info, leaf, key, slot);
+ ret = check_dir_item(leaf, key, prev_key, slot);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ ret = check_inode_ref(leaf, key, prev_key, slot);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
- ret = check_block_group_item(fs_info, leaf, key, slot);
+ ret = check_block_group_item(leaf, key, slot);
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = btrfs_check_chunk_valid(leaf, chunk, key->offset);
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ ret = check_dev_item(leaf, key, slot);
+ break;
+ case BTRFS_INODE_ITEM_KEY:
+ ret = check_inode_item(leaf, key, slot);
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ret = check_root_item(leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ case BTRFS_METADATA_ITEM_KEY:
+ ret = check_extent_item(leaf, key, slot);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = check_simple_keyed_refs(leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ ret = check_extent_data_ref(leaf, key, slot);
break;
}
return ret;
}
-static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
- bool check_item_data)
+static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
/* No valid key type is 0, so all key should be larger than this key */
struct btrfs_key prev_key = {0, 0, 0};
struct btrfs_key key;
@@ -486,7 +1421,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
int slot;
if (btrfs_header_level(leaf) != 0) {
- generic_err(fs_info, leaf, 0,
+ generic_err(leaf, 0,
"invalid level for leaf, have %d expect 0",
btrfs_header_level(leaf));
return -EUCLEAN;
@@ -502,7 +1437,6 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
*/
if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
u64 owner = btrfs_header_owner(leaf);
- struct btrfs_root *check_root;
/* These trees must never be empty */
if (owner == BTRFS_ROOT_TREE_OBJECTID ||
@@ -511,33 +1445,16 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
owner == BTRFS_DEV_TREE_OBJECTID ||
owner == BTRFS_FS_TREE_OBJECTID ||
owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
- generic_err(fs_info, leaf, 0,
+ generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
return -EUCLEAN;
}
- key.objectid = owner;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- check_root = btrfs_get_fs_root(fs_info, &key, false);
- /*
- * The only reason we also check NULL here is that during
- * open_ctree() some roots has not yet been set up.
- */
- if (!IS_ERR_OR_NULL(check_root)) {
- struct extent_buffer *eb;
-
- eb = btrfs_root_node(check_root);
- /* if leaf is the root, then it's fine */
- if (leaf != eb) {
- generic_err(fs_info, leaf, 0,
- "invalid nritems, have %u should not be 0 for non-root leaf",
- nritems);
- free_extent_buffer(eb);
- return -EUCLEAN;
- }
- free_extent_buffer(eb);
+ /* Unknown tree */
+ if (owner == 0) {
+ generic_err(leaf, 0,
+ "invalid owner, root 0 is not defined");
+ return -EUCLEAN;
}
return 0;
}
@@ -564,7 +1481,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
/* Make sure the keys are in the right order */
if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
prev_key.offset, key.objectid, key.type,
@@ -583,7 +1500,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
item_end_expected = btrfs_item_offset_nr(leaf,
slot - 1);
if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_end_nr(leaf, slot),
item_end_expected);
@@ -597,7 +1514,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
*/
if (btrfs_item_end_nr(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(fs_info)) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_end_nr(leaf, slot),
BTRFS_LEAF_DATA_SIZE(fs_info));
@@ -607,7 +1524,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
/* Also check if the item pointer overlaps with btrfs item. */
if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
btrfs_item_ptr_offset(leaf, slot)) {
- generic_err(fs_info, leaf, slot,
+ generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
sizeof(struct btrfs_item),
@@ -620,7 +1537,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
* Check if the item size and content meet other
* criteria
*/
- ret = check_leaf_item(fs_info, leaf, &key, slot);
+ ret = check_leaf_item(leaf, &key, slot, &prev_key);
if (ret < 0)
return ret;
}
@@ -633,20 +1550,20 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
return 0;
}
-int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf)
+int btrfs_check_leaf_full(struct extent_buffer *leaf)
{
- return check_leaf(fs_info, leaf, true);
+ return check_leaf(leaf, true);
}
+ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO);
-int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf)
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf)
{
- return check_leaf(fs_info, leaf, false);
+ return check_leaf(leaf, false);
}
-int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
+int btrfs_check_node(struct extent_buffer *node)
{
+ struct btrfs_fs_info *fs_info = node->fs_info;
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
int slot;
@@ -655,7 +1572,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
int ret = 0;
if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
- generic_err(fs_info, node, 0,
+ generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
level, BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
@@ -675,13 +1592,13 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
if (!bytenr) {
- generic_err(fs_info, node, slot,
+ generic_err(node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
- generic_err(fs_info, node, slot,
+ generic_err(node, slot,
"unaligned pointer, have %llu should be aligned to %u",
bytenr, fs_info->sectorsize);
ret = -EUCLEAN;
@@ -689,7 +1606,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
}
if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
- generic_err(fs_info, node, slot,
+ generic_err(node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
next_key.objectid, next_key.type,
@@ -701,3 +1618,4 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
out:
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ff043275b784..32fecc9dc1dd 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -14,15 +14,16 @@
* Will check not only the item pointers, but also every possible member
* in item data.
*/
-int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf);
+int btrfs_check_leaf_full(struct extent_buffer *leaf);
/*
* Less strict leaf checker.
* Will only check item pointers, not reading item data.
*/
-int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf);
-int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf);
+int btrfs_check_node(struct extent_buffer *node);
+
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 561884f60d35..d3f115909ff0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -8,6 +8,7 @@
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
+#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
@@ -24,10 +25,12 @@
* LOG_INODE_EXISTS means to log just enough to recreate the inode
* during log replay
*/
-#define LOG_INODE_ALL 0
-#define LOG_INODE_EXISTS 1
-#define LOG_OTHER_INODE 2
-#define LOG_OTHER_INODE_ALL 3
+enum {
+ LOG_INODE_ALL,
+ LOG_INODE_EXISTS,
+ LOG_OTHER_INODE,
+ LOG_OTHER_INODE_ALL,
+};
/*
* directory trouble cases
@@ -81,10 +84,12 @@
* The last stage is to deal with directories and links and extents
* and all the other fun semantics
*/
-#define LOG_WALK_PIN_ONLY 0
-#define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_DIR_INDEX 2
-#define LOG_WALK_REPLAY_ALL 3
+enum {
+ LOG_WALK_PIN_ONLY,
+ LOG_WALK_REPLAY_INODES,
+ LOG_WALK_REPLAY_DIR_INDEX,
+ LOG_WALK_REPLAY_ALL,
+};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
@@ -139,7 +144,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
if (root->log_root) {
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
@@ -188,10 +193,6 @@ static int join_running_log_trans(struct btrfs_root *root)
{
int ret = -ENOENT;
- smp_mb();
- if (!root->log_root)
- return -ENOENT;
-
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -225,6 +226,17 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
+static int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
+ buf->start + buf->len - 1);
+}
+
+static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
/*
* the walk control struct is used to pass state down the chain when
@@ -304,7 +316,7 @@ static int process_one_buffer(struct btrfs_root *log,
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
if (wc->pin && btrfs_header_level(eb) == 0)
- ret = btrfs_exclude_logged_extents(fs_info, eb);
+ ret = btrfs_exclude_logged_extents(eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
@@ -333,7 +345,6 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -454,10 +465,9 @@ insert:
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(fs_info, path, item_size, 1);
+ btrfs_truncate_item(path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(fs_info, path,
- item_size - found_size);
+ btrfs_extend_item(path, item_size - found_size);
} else if (ret) {
return ret;
}
@@ -496,7 +506,7 @@ insert:
ino_size != 0) {
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, dst_eb);
btrfs_set_token_inode_size(dst_eb, dst_item,
ino_size, &token);
}
@@ -549,7 +559,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(root->fs_info->sb, &key, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -694,9 +704,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
if (ins.objectid > 0) {
+ struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
+
/*
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
@@ -704,10 +716,13 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret == 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- ins.objectid, ins.offset,
- 0, root->root_key.objectid,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ ins.objectid, ins.offset, 0);
+ btrfs_init_data_ref(&ref,
+ root->root_key.objectid,
key->objectid, offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
} else {
@@ -793,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_del_csums(trans, fs_info,
+ ret = btrfs_del_csums(trans,
+ fs_info->csum_root,
sums->bytenr,
sums->len);
if (!ret)
@@ -930,54 +946,32 @@ static noinline int backref_in_log(struct btrfs_root *log,
const char *name, int namelen)
{
struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
- unsigned long ptr;
- unsigned long ptr_end;
- unsigned long name_ptr;
- int found_name_len;
- int item_size;
int ret;
- int match = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret != 0)
+ if (ret < 0) {
goto out;
-
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
-
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
- if (btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0],
- ref_objectid,
- name, namelen, NULL))
- match = 1;
-
+ } else if (ret == 1) {
+ ret = 0;
goto out;
}
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- ref = (struct btrfs_inode_ref *)ptr;
- found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
- if (found_name_len == namelen) {
- name_ptr = (unsigned long)(ref + 1);
- ret = memcmp_extent_buffer(path->nodes[0], name,
- name_ptr, namelen);
- if (ret == 0) {
- match = 1;
- goto out;
- }
- }
- ptr = (unsigned long)(ref + 1) + found_name_len;
- }
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
+ name, namelen);
+ else
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0],
+ name, namelen);
out:
btrfs_free_path(path);
- return match;
+ return ret;
}
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
@@ -1035,10 +1029,13 @@ again:
(unsigned long)(victim_ref + 1),
victim_name_len);
- if (!backref_in_log(log_root, &search_key,
- parent_objectid,
- victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ kfree(victim_name);
+ return ret;
+ } else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
@@ -1100,10 +1097,12 @@ again:
search_key.offset = btrfs_extref_hash(parent_objectid,
victim_name,
victim_name_len);
- ret = 0;
- if (!backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ return ret;
+ } else if (!ret) {
ret = -ENOENT;
victim_parent = read_one_inode(root,
parent_objectid);
@@ -1252,12 +1251,12 @@ again:
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
- parent_id, name,
- namelen, NULL);
+ ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ parent_id, name,
+ namelen);
else
- ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
- namelen, NULL);
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
+ name, namelen);
if (!ret) {
struct inode *dir;
@@ -1319,12 +1318,11 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
goto out;
}
if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id,
- name, namelen, NULL);
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0], parent_id, name, namelen);
else
- ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen, NULL);
+ ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, namelen);
out:
btrfs_free_path(path);
@@ -1871,30 +1869,6 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
- * Return true if an inode reference exists in the log for the given name,
- * inode and parent inode.
- */
-static bool name_in_log_ref(struct btrfs_root *log_root,
- const char *name, const int name_len,
- const u64 dirid, const u64 ino)
-{
- struct btrfs_key search_key;
-
- search_key.objectid = ino;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = dirid;
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(dirid, name, name_len);
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- return false;
-}
-
-/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -2010,8 +1984,31 @@ out:
return ret;
insert:
- if (name_in_log_ref(root->log_root, name, name_len,
- key->objectid, log_key.objectid)) {
+ /*
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
+ */
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_REF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_EXTREF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
@@ -2725,7 +2722,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking_write(next);
- clean_tree_block(fs_info, next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2809,7 +2806,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking_write(next);
- clean_tree_block(fs_info, next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2855,7 +2852,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- extent_buffer_get(log->node);
+ atomic_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
@@ -2891,7 +2888,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking_write(next);
- clean_tree_block(fs_info, next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2918,7 +2915,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2926,10 +2924,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -3027,6 +3025,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3066,7 +3065,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
mutex_unlock(&root->log_mutex);
goto out;
@@ -3085,12 +3084,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
@@ -3114,9 +3127,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
- ret = update_log_root(trans, log);
-
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
@@ -3127,7 +3146,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -3173,7 +3192,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3186,7 +3205,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -3196,7 +3215,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log_root_tree,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
@@ -3218,7 +3237,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = write_all_supers(fs_info, 1);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
@@ -3305,6 +3324,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
+
+/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
*
@@ -3338,7 +3381,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3422,7 +3465,7 @@ fail:
out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret == -ENOSPC) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 0;
} else if (ret < 0)
btrfs_abort_transaction(trans, ret);
@@ -3438,12 +3481,11 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log;
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
@@ -3456,7 +3498,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
dirid, &index);
mutex_unlock(&inode->log_mutex);
if (ret == -ENOSPC) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, ret);
@@ -3801,7 +3843,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
{
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, leaf);
if (log_inode_only) {
/* set the generation to zero so the recover code
@@ -3868,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return 0;
}
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ int ret;
+
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ if (ret)
+ return ret;
+
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
@@ -4013,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = log_csums(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4169,6 +4233,7 @@ fill_holes:
*last_extent, 0,
0, len, 0, len,
0, 0, 0);
+ *last_extent += len;
}
}
}
@@ -4232,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log_root, sums);
+ ret = log_csums(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4260,8 +4325,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- btrfs_init_map_token(&token);
-
ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
em->start + em->len, NULL, 0, 1,
sizeof(*fi), &extent_inserted);
@@ -4279,6 +4342,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
return ret;
}
leaf = path->nodes[0];
+ btrfs_init_map_token(&token, leaf);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -4924,7 +4988,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -4934,8 +4998,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
key.objectid = parent;
- inode = btrfs_iget(fs_info->sb, &key, root,
- NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
@@ -4943,7 +5006,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
0, LLONG_MAX, ctx);
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
}
continue;
@@ -4958,7 +5021,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
if (ret) {
- iput(inode);
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -4967,7 +5030,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- iput(inode);
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5014,7 +5077,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
}
path->slots[0]++;
}
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
return ret;
@@ -5402,9 +5465,19 @@ log_extents:
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5442,7 +5515,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
* Make sure any commits to the log are forced to be full
* commits.
*/
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = true;
}
mutex_unlock(&inode->log_mutex);
@@ -5464,7 +5537,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct dentry *old_parent = NULL;
- struct btrfs_inode *orig_inode = inode;
/*
* for regular files, if its inode is already on disk, we don't
@@ -5484,16 +5556,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
}
while (1) {
- /*
- * If we are logging a directory then we start with our inode,
- * not our parent's inode, so we need to skip setting the
- * logged_trans so that further down in the log code we don't
- * think this inode has already been logged.
- */
- if (inode != orig_inode)
- inode->logged_trans = trans->transid;
- smp_mb();
-
if (btrfs_must_commit_transaction(trans, inode)) {
ret = 1;
break;
@@ -5641,14 +5703,14 @@ process_leaf:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
+ di_inode = btrfs_iget(fs_info->sb, &di_key, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto next_dir_inode;
}
if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5660,7 +5722,7 @@ process_leaf:
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5767,8 +5829,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, &inode_key,
- root, NULL);
+ dir_inode = btrfs_iget(fs_info->sb, &inode_key, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -5807,7 +5868,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -5819,6 +5880,190 @@ out:
return ret;
}
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ while (true) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u64 last_committed = fs_info->last_trans_committed;
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key search_key;
+ struct inode *inode;
+ int ret = 0;
+
+ btrfs_release_path(path);
+
+ search_key.objectid = found_key.offset;
+ search_key.type = BTRFS_INODE_ITEM_KEY;
+ search_key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &search_key, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (BTRFS_I(inode)->generation > last_committed)
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ return ret;
+
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ search_key.type = BTRFS_INODE_REF_KEY;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != search_key.objectid ||
+ found_key.type != BTRFS_INODE_REF_KEY)
+ return -ENOENT;
+ }
+ return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct dentry *old_parent = NULL;
+ struct super_block *sb = inode->vfs_inode.i_sb;
+ int ret = 0;
+
+ while (true) {
+ if (!parent || d_really_is_negative(parent) ||
+ sb != parent->d_sb)
+ break;
+
+ inode = BTRFS_I(d_inode(parent));
+ if (root != inode->root)
+ break;
+
+ if (inode->generation > fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ if (ret)
+ break;
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ }
+ dput(old_parent);
+
+ return ret;
+}
+
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ /*
+ * For a single hard link case, go through a fast path that does not
+ * need to iterate the fs/subvolume tree.
+ */
+ if (inode->vfs_inode.i_nlink < 2)
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ path->slots[0]++;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key found_key;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != ino ||
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ /*
+ * Don't deal with extended references because they are rare
+ * cases and too complex to deal with (we would need to keep
+ * track of which subitem we are processing for each item in
+ * this loop, etc). So just return some error to fallback to
+ * a transaction commit.
+ */
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = -EMLINK;
+ goto out;
+ }
+
+ /*
+ * Logging ancestors needs to do more searches on the fs/subvol
+ * tree, so it releases the path as needed to avoid deadlocks.
+ * Keep track of the last inode ref key and resume from that key
+ * after logging all new ancestors for the current hard link.
+ */
+ memcpy(&search_key, &found_key, sizeof(search_key));
+
+ ret = log_new_ancestors(trans, root, path, ctx);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ goto again;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -5836,11 +6081,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
- struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
- struct btrfs_inode *orig_inode = inode;
sb = inode->vfs_inode.i_sb;
@@ -5946,56 +6189,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and has a link count of 2.
*/
if (inode->last_unlink_trans > last_committed) {
- ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
}
- /*
- * If a new hard link was added to the inode in the current transaction
- * and its link count is now greater than 1, we need to fallback to a
- * transaction commit, otherwise we can end up not logging all its new
- * parents for all the hard links. Here just from the dentry used to
- * fsync, we can not visit the ancestor inodes for all the other hard
- * links to figure out if any is new, so we fallback to a transaction
- * commit (instead of adding a lot of complexity of scanning a btree,
- * since this scenario is not a common use case).
- */
- if (inode->vfs_inode.i_nlink > 1 &&
- inode->last_link_trans > last_committed) {
- ret = -EMLINK;
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
+ if (ret)
goto end_trans;
- }
-
- while (1) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
-
- inode = BTRFS_I(d_inode(parent));
- if (root != inode->root)
- break;
-
- if (inode->generation > last_committed) {
- ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
- if (ret)
- goto end_trans;
- }
- if (IS_ROOT(parent))
- break;
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- }
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ ret = log_new_dir_dentries(trans, root, inode, ctx);
else
ret = 0;
end_trans:
- dput(old_parent);
if (ret < 0) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 1;
}
@@ -6044,7 +6253,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
- .stage = 0,
+ .stage = LOG_WALK_PIN_ONLY,
};
path = btrfs_alloc_path();
@@ -6108,9 +6317,28 @@ again:
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(fs_info,
+ log->node->start,
+ log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
+
+ if (!ret)
+ goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
@@ -6142,7 +6370,6 @@ again:
&root->highest_objectid);
}
- key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
@@ -6150,9 +6377,10 @@ again:
if (ret)
goto error;
-
+next:
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
@@ -6222,7 +6450,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* if this directory was already logged any new
* names for this file/dir will get recorded
*/
- smp_mb();
if (dir->logged_trans == trans->transid)
return;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0fab84a8f670..132e43d29034 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,16 +30,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
INIT_LIST_HEAD(&ctx->list);
}
-static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans)
+static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
{
- WRITE_ONCE(fs_info->last_trans_log_full_commit, trans->transid);
+ WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid);
}
-static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans)
+static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans)
{
- return READ_ONCE(fs_info->last_trans_log_full_commit) ==
+ return READ_ONCE(trans->fs_info->last_trans_log_full_commit) ==
trans->transid;
}
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 3b2ae342e649..76b84f2397b1 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -121,12 +121,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
* An item with that type already exists.
* Extend the item and store the new subid at the end.
*/
- btrfs_extend_item(fs_info, path, sizeof(subid_le));
+ btrfs_extend_item(path, sizeof(subid_le));
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
- } else if (ret < 0) {
+ } else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
ret, (unsigned long long)key.objectid,
@@ -219,7 +219,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
move_src = offset + sizeof(subid);
move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
memmove_extent_buffer(eb, move_dst, move_src, move_len);
- btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1);
+ btrfs_truncate_item(path, item_size - sizeof(subid), 1);
out:
btrfs_free_path(path);
@@ -324,6 +324,8 @@ again_search_slot:
}
if (ret < 0 && ret != -ENOENT)
goto out;
+ key.offset++;
+ goto again_search_slot;
}
item_size -= sizeof(subid_le);
offset += sizeof(subid_le);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index db934ceae9c1..9b78e720c697 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
@@ -24,9 +25,11 @@
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
-#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"
+#include "tree-checker.h"
+#include "space-info.h"
+#include "block-group.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -55,6 +58,30 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
+ [BTRFS_RAID_RAID1C3] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 3,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 3,
+ .ncopies = 3,
+ .raid_name = "raid1c3",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1C4] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 4,
+ .devs_min = 4,
+ .tolerated_failures = 3,
+ .devs_increment = 4,
+ .ncopies = 4,
+ .raid_name = "raid1c4",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
+ },
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
@@ -122,12 +149,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
-const char *get_raid_name(enum btrfs_raid_types type)
+const char *btrfs_bg_type_to_raid_name(u64 flags)
{
- if (type >= BTRFS_NR_RAID_TYPES)
+ const int index = btrfs_bg_flags_to_raid_index(flags);
+
+ if (index >= BTRFS_NR_RAID_TYPES)
return NULL;
- return btrfs_raid_array[type].raid_name;
+ return btrfs_raid_array[index].raid_name;
}
/*
@@ -184,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
out_overflow:;
}
-static int init_first_rw_device(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -237,7 +264,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* chunk_mutex
* -----------
* protects chunks, adding or removing during allocation, trim or when a new
- * device is added/removed
+ * device is added/removed. Additionally it also protects post_commit_list of
+ * individual devices, since they can be added to the transaction's
+ * post_commit_list only with chunk_mutex held.
*
* cleaner_mutex
* -------------
@@ -292,7 +321,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-struct list_head *btrfs_get_fs_uuids(void)
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
@@ -318,7 +347,6 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
- INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
if (fsid)
@@ -334,7 +362,9 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
void btrfs_free_device(struct btrfs_device *device)
{
+ WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
+ extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
kfree(device);
}
@@ -352,19 +382,6 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
kfree(fs_devices);
}
-static void btrfs_kobject_uevent(struct block_device *bdev,
- enum kobject_action action)
-{
- int ret;
-
- ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
- if (ret)
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
- action,
- kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
- &disk_to_dev(bdev->bd_disk)->kobj);
-}
-
void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -402,15 +419,14 @@ static struct btrfs_device *__alloc_device(void)
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
- INIT_LIST_HEAD(&dev->resized_list);
-
- spin_lock_init(&dev->io_lock);
+ INIT_LIST_HEAD(&dev->post_commit_list);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
return dev;
}
@@ -507,212 +523,6 @@ error:
return ret;
}
-static void requeue_list(struct btrfs_pending_bios *pending_bios,
- struct bio *head, struct bio *tail)
-{
-
- struct bio *old_head;
-
- old_head = pending_bios->head;
- pending_bios->head = head;
- if (pending_bios->tail)
- tail->bi_next = old_head;
- else
- pending_bios->tail = tail;
-}
-
-/*
- * we try to collect pending bios for a device so we don't get a large
- * number of procs sending bios down to the same device. This greatly
- * improves the schedulers ability to collect and merge the bios.
- *
- * But, it also turns into a long list of bios to process and that is sure
- * to eventually make the worker thread block. The solution here is to
- * make some progress and then put this work struct back at the end of
- * the list if the block device is congested. This way, multiple devices
- * can make progress from a single worker thread.
- */
-static noinline void run_scheduled_bios(struct btrfs_device *device)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct bio *pending;
- struct backing_dev_info *bdi;
- struct btrfs_pending_bios *pending_bios;
- struct bio *tail;
- struct bio *cur;
- int again = 0;
- unsigned long num_run;
- unsigned long batch_run = 0;
- unsigned long last_waited = 0;
- int force_reg = 0;
- int sync_pending = 0;
- struct blk_plug plug;
-
- /*
- * this function runs all the bios we've collected for
- * a particular device. We don't want to wander off to
- * another device without first sending all of these down.
- * So, setup a plug here and finish it off before we return
- */
- blk_start_plug(&plug);
-
- bdi = device->bdev->bd_bdi;
-
-loop:
- spin_lock(&device->io_lock);
-
-loop_lock:
- num_run = 0;
-
- /* take all the bios off the list at once and process them
- * later on (without the lock held). But, remember the
- * tail and other pointers so the bios can be properly reinserted
- * into the list if we hit congestion
- */
- if (!force_reg && device->pending_sync_bios.head) {
- pending_bios = &device->pending_sync_bios;
- force_reg = 1;
- } else {
- pending_bios = &device->pending_bios;
- force_reg = 0;
- }
-
- pending = pending_bios->head;
- tail = pending_bios->tail;
- WARN_ON(pending && !tail);
-
- /*
- * if pending was null this time around, no bios need processing
- * at all and we can stop. Otherwise it'll loop back up again
- * and do an additional check so no bios are missed.
- *
- * device->running_pending is used to synchronize with the
- * schedule_bio code.
- */
- if (device->pending_sync_bios.head == NULL &&
- device->pending_bios.head == NULL) {
- again = 0;
- device->running_pending = 0;
- } else {
- again = 1;
- device->running_pending = 1;
- }
-
- pending_bios->head = NULL;
- pending_bios->tail = NULL;
-
- spin_unlock(&device->io_lock);
-
- while (pending) {
-
- rmb();
- /* we want to work on both lists, but do more bios on the
- * sync list than the regular list
- */
- if ((num_run > 32 &&
- pending_bios != &device->pending_sync_bios &&
- device->pending_sync_bios.head) ||
- (num_run > 64 && pending_bios == &device->pending_sync_bios &&
- device->pending_bios.head)) {
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- goto loop_lock;
- }
-
- cur = pending;
- pending = pending->bi_next;
- cur->bi_next = NULL;
-
- BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
-
- /*
- * if we're doing the sync list, record that our
- * plug has some sync requests on it
- *
- * If we're doing the regular list and there are
- * sync requests sitting around, unplug before
- * we add more
- */
- if (pending_bios == &device->pending_sync_bios) {
- sync_pending = 1;
- } else if (sync_pending) {
- blk_finish_plug(&plug);
- blk_start_plug(&plug);
- sync_pending = 0;
- }
-
- btrfsic_submit_bio(cur);
- num_run++;
- batch_run++;
-
- cond_resched();
-
- /*
- * we made progress, there is more work to do and the bdi
- * is now congested. Back off and let other work structs
- * run instead
- */
- if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
- fs_info->fs_devices->open_devices > 1) {
- struct io_context *ioc;
-
- ioc = current->io_context;
-
- /*
- * the main goal here is that we don't want to
- * block if we're going to be able to submit
- * more requests without blocking.
- *
- * This code does two great things, it pokes into
- * the elevator code from a filesystem _and_
- * it makes assumptions about how batching works.
- */
- if (ioc && ioc->nr_batch_requests > 0 &&
- time_before(jiffies, ioc->last_waited + HZ/50UL) &&
- (last_waited == 0 ||
- ioc->last_waited == last_waited)) {
- /*
- * we want to go through our batch of
- * requests and stop. So, we copy out
- * the ioc->last_waited time and test
- * against it before looping
- */
- last_waited = ioc->last_waited;
- cond_resched();
- continue;
- }
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- device->running_pending = 1;
-
- spin_unlock(&device->io_lock);
- btrfs_queue_work(fs_info->submit_workers,
- &device->work);
- goto done;
- }
- }
-
- cond_resched();
- if (again)
- goto loop;
-
- spin_lock(&device->io_lock);
- if (device->pending_bios.head || device->pending_sync_bios.head)
- goto loop_lock;
- spin_unlock(&device->io_lock);
-
-done:
- blk_finish_plug(&plug);
-}
-
-static void pending_bios_fn(struct btrfs_work *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, work);
- run_scheduled_bios(device);
-}
-
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
int found;
@@ -824,7 +634,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
} else {
if (bdev_read_only(bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -834,7 +644,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -1011,11 +821,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
*new_device_added = true;
if (disk_super->label[0])
- pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
- disk_super->label, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->label, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
else
- pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
- disk_super->fsid, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->fsid, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
/*
@@ -1121,6 +935,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
struct btrfs_device *orig_dev;
+ int ret = 0;
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
@@ -1134,8 +949,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid);
- if (IS_ERR(device))
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
goto error;
+ }
/*
* This is ok to do without rcu read locked because we hold the
@@ -1146,6 +963,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
+ ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -1160,7 +978,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
error:
mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
/*
@@ -1230,14 +1048,6 @@ again:
mutex_unlock(&uuid_mutex);
}
-static void free_device_rcu(struct rcu_head *head)
-{
- struct btrfs_device *device;
-
- device = container_of(head, struct btrfs_device, rcu);
- btrfs_free_device(device);
-}
-
static void btrfs_close_bdev(struct btrfs_device *device)
{
if (!device->bdev)
@@ -1285,7 +1095,8 @@ static void btrfs_close_one_device(struct btrfs_device *device)
list_replace_rcu(&device->dev_list, &new_device->dev_list);
new_device->fs_devices = device->fs_devices;
- call_rcu(&device->rcu, free_device_rcu);
+ synchronize_rcu();
+ btrfs_free_device(device);
}
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -1304,7 +1115,7 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
- fs_devices->seeding = 0;
+ fs_devices->seeding = false;
return 0;
}
@@ -1505,58 +1316,29 @@ error_bdev_put:
return device;
}
-static int contains_pending_extent(struct btrfs_transaction *transaction,
- struct btrfs_device *device,
- u64 *start, u64 len)
+/*
+ * Try to find a chunk that intersects [start, start + len] range and when one
+ * such is found, record the end of it in *start
+ */
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+ u64 len)
{
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct extent_map *em;
- struct list_head *search_list = &fs_info->pinned_chunks;
- int ret = 0;
- u64 physical_start = *start;
+ u64 physical_start, physical_end;
- if (transaction)
- search_list = &transaction->pending_chunks;
-again:
- list_for_each_entry(em, search_list, list) {
- struct map_lookup *map;
- int i;
+ lockdep_assert_held(&device->fs_info->chunk_mutex);
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++) {
- u64 end;
+ if (!find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
- if (map->stripes[i].dev != device)
- continue;
- if (map->stripes[i].physical >= physical_start + len ||
- map->stripes[i].physical + em->orig_block_len <=
- physical_start)
- continue;
- /*
- * Make sure that while processing the pinned list we do
- * not override our *start with a lower value, because
- * we can have pinned chunks that fall within this
- * device hole and that have lower physical addresses
- * than the pending chunks we processed before. If we
- * do not take this special care we can end up getting
- * 2 pending chunks that start at the same physical
- * device offsets because the end offset of a pinned
- * chunk can be equal to the start offset of some
- * pending chunk.
- */
- end = map->stripes[i].physical + em->orig_block_len;
- if (end > *start) {
- *start = end;
- ret = 1;
- }
+ if (in_range(physical_start, *start, len) ||
+ in_range(*start, physical_start,
+ physical_end - physical_start)) {
+ *start = physical_end + 1;
+ return true;
}
}
- if (search_list != &fs_info->pinned_chunks) {
- search_list = &fs_info->pinned_chunks;
- goto again;
- }
-
- return ret;
+ return false;
}
@@ -1580,10 +1362,16 @@ again:
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
+ *
+ * NOTE: This function will search *commit* root of device tree, and does extra
+ * check to ensure dev extents are not double allocated.
+ * This makes the function safe to allocate dev extents but may not report
+ * correct usable device space, as device extent freed in current transaction
+ * is not reported as avaiable.
*/
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
- struct btrfs_device *device, u64 num_bytes,
- u64 search_start, u64 *start, u64 *len)
+static int find_free_dev_extent_start(struct btrfs_device *device,
+ u64 num_bytes, u64 search_start, u64 *start,
+ u64 *len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
@@ -1667,15 +1455,12 @@ again:
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
- if (contains_pending_extent(transaction, device,
- &search_start,
+ if (contains_pending_extent(device, &search_start,
hole_size)) {
- if (key.offset >= search_start) {
+ if (key.offset >= search_start)
hole_size = key.offset - search_start;
- } else {
- WARN_ON_ONCE(1);
+ else
hole_size = 0;
- }
}
if (hole_size > max_hole_size) {
@@ -1716,8 +1501,7 @@ next:
if (search_end > search_start) {
hole_size = search_end - search_start;
- if (contains_pending_extent(transaction, device, &search_start,
- hole_size)) {
+ if (contains_pending_extent(device, &search_start, hole_size)) {
btrfs_release_path(path);
goto again;
}
@@ -1742,13 +1526,11 @@ out:
return ret;
}
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
/* FIXME use last free of some kind */
- return find_free_dev_extent_start(trans->transaction, device,
- num_bytes, 0, start, len);
+ return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1859,7 +1641,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map.rb_root);
if (n) {
@@ -1891,7 +1673,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
if (ret < 0)
goto error;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /* Corruption */
+ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
+ ret = -EUCLEAN;
+ goto error;
+ }
ret = btrfs_previous_item(fs_info->chunk_root, path,
BTRFS_DEV_ITEMS_OBJECTID,
@@ -1982,10 +1769,9 @@ static void update_dev_time(const char *path_name)
filp_close(filp, NULL);
}
-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
- struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_device *device)
{
- struct btrfs_root *root = fs_info->chunk_root;
+ struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2082,7 +1868,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
-void btrfs_assign_next_active_device(struct btrfs_device *device,
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev)
{
struct btrfs_fs_info *fs_info = device->fs_info;
@@ -2186,12 +1972,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
- ret = btrfs_rm_dev_item(fs_info, device);
+ ret = btrfs_rm_dev_item(device);
if (ret)
goto error_undo;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- btrfs_scrub_cancel_dev(fs_info, device);
+ btrfs_scrub_cancel_dev(device);
/*
* the device list mutex makes sure that we don't change
@@ -2242,7 +2028,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_scratch_superblocks(device->bdev, device->name->str);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device_rcu);
+ synchronize_rcu();
+ btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
while (fs_devices) {
@@ -2299,9 +2086,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
fs_devices->open_devices--;
}
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
+ struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
@@ -2310,7 +2097,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
}
btrfs_close_bdev(srcdev);
- call_rcu(&srcdev->rcu, free_device_rcu);
+ synchronize_rcu();
+ btrfs_free_device(srcdev);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
@@ -2368,7 +2156,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
btrfs_close_bdev(tgtdev);
- call_rcu(&tgtdev->rcu, free_device_rcu);
+ synchronize_rcu();
+ btrfs_free_device(tgtdev);
}
static struct btrfs_device *btrfs_find_device_by_path(
@@ -2481,11 +2270,11 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
mutex_unlock(&fs_info->chunk_mutex);
- fs_devices->seeding = 0;
+ fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->rotating = 0;
+ fs_devices->rotating = false;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2503,9 +2292,9 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
/*
* Store the expected generation for seed devices in device items.
*/
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -2680,7 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
@@ -2705,7 +2494,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_lock(&fs_info->chunk_mutex);
- ret = init_first_rw_device(trans, fs_info);
+ ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2720,22 +2509,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
if (seeding_dev) {
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
-
- ret = btrfs_finish_sprout(trans, fs_info);
+ ret = btrfs_finish_sprout(trans);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
- /* Sprouting would change fsid of the mounted root,
- * so rename the fsid on the sysfs
- */
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
- fs_info->fs_devices->fsid);
- if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
- btrfs_warn(fs_info,
- "sysfs: failed to create fsid for sprout");
+ btrfs_sysfs_update_sprout_fsid(fs_devices,
+ fs_info->fs_devices->fsid);
}
ret = btrfs_commit_transaction(trans);
@@ -2852,7 +2633,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_super_block *super_copy = fs_info->super_copy;
- struct btrfs_fs_devices *fs_devices;
u64 old_total;
u64 diff;
@@ -2871,8 +2651,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
return -EINVAL;
}
- fs_devices = fs_info->fs_devices;
-
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
@@ -2880,9 +2658,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->fs_info);
- if (list_empty(&device->resized_list))
- list_add_tail(&device->resized_list,
- &fs_devices->resized_devices);
+ if (list_empty(&device->post_commit_list))
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
return btrfs_update_device(trans, device);
@@ -2983,7 +2761,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree;
struct extent_map *em;
- em_tree = &fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, length);
read_unlock(&em_tree->lock);
@@ -3113,10 +2891,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
*/
lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
- ret = btrfs_can_relocate(fs_info, chunk_offset);
- if (ret)
- return -ENOSPC;
-
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
@@ -3124,16 +2898,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
- /*
- * We add the kobjects here (and after forcing data chunk creation)
- * since relocation is the only place we'll create chunks of a new
- * type at runtime. The only place where we'll remove the last
- * chunk of a type is the call immediately below this one. Even
- * so, we're protected against races with the cleaner thread since
- * we're covered by the delete_unused_bgs_mutex.
- */
- btrfs_add_raid_kobjects(fs_info);
-
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3233,7 +2997,7 @@ error:
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_offset)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytes_used;
u64 chunk_type;
@@ -3242,30 +3006,28 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
chunk_type = cache->flags;
btrfs_put_block_group(cache);
- if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
-
- if (!bytes_used) {
- struct btrfs_trans_handle *trans;
- int ret;
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
- trans = btrfs_join_transaction(fs_info->tree_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
- ret = btrfs_force_chunk_alloc(trans,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
- if (ret < 0)
- return ret;
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
- btrfs_add_raid_kobjects(fs_info);
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- return 1;
- }
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+ return 1;
}
+
return 0;
}
@@ -3444,28 +3206,28 @@ static int chunk_profiles_filter(u64 chunk_type,
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->key.offset,
- bargs->usage_min);
+ user_thresh_min = div_factor_fine(cache->length,
+ bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
- user_thresh_max = cache->key.offset;
+ user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->key.offset,
- bargs->usage_max);
+ user_thresh_max = div_factor_fine(cache->length,
+ bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
@@ -3477,20 +3239,19 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
- user_thresh = cache->key.offset;
+ user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->key.offset,
- bargs->usage);
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
@@ -3516,6 +3277,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
return 1;
}
+static u64 calc_data_stripes(u64 type, int num_stripes)
+{
+ const int index = btrfs_bg_flags_to_raid_index(type);
+ const int ncopies = btrfs_raid_array[index].ncopies;
+ const int nparity = btrfs_raid_array[index].nparity;
+
+ if (nparity)
+ return num_stripes - nparity;
+ else
+ return num_stripes / ncopies;
+}
+
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
@@ -3525,22 +3298,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
+ u64 type;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
- if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
- factor = num_stripes / 2;
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
- factor = num_stripes - 1;
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
- factor = num_stripes - 2;
- } else {
- factor = num_stripes;
- }
+ type = btrfs_chunk_type(leaf, chunk);
+ factor = calc_data_stripes(type, num_stripes);
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
@@ -3601,10 +3367,10 @@ static int chunk_soft_convert_filter(u64 chunk_type,
return 0;
}
-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
+static int should_balance_chunk(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
@@ -3784,8 +3550,7 @@ again:
spin_unlock(&fs_info->balance_lock);
}
- ret = should_balance_chunk(fs_info, leaf, chunk,
- found_key.offset);
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
btrfs_release_path(path);
if (!ret) {
@@ -3899,8 +3664,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
- /* true if exactly one bit set */
- return is_power_of_2(flags);
+ return has_single_bit_set(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3964,11 +3728,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
bp += ret; \
} while (0)
- if (flags & BTRFS_BALANCE_ARGS_CONVERT) {
- int index = btrfs_bg_flags_to_raid_index(bargs->target);
-
- CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index));
- }
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT)
+ CHECK_APPEND_1ARG("convert=%s,",
+ btrfs_bg_type_to_raid_name(bargs->target));
if (flags & BTRFS_BALANCE_ARGS_SOFT)
CHECK_APPEND_NOARG("soft,");
@@ -4089,7 +3851,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
int ret;
u64 num_devices;
unsigned seq;
- bool reducing_integrity;
+ bool reducing_redundancy;
+ int i;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@@ -4118,49 +3881,54 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = btrfs_num_devices(fs_info);
+ /*
+ * rw_devices will not change at the moment, device add/delete/replace
+ * are excluded by EXCL_OP
+ */
+ num_devices = fs_info->fs_devices->rw_devices;
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
- if (num_devices > 1)
- allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
- if (num_devices > 2)
- allowed |= BTRFS_BLOCK_GROUP_RAID5;
- if (num_devices > 3)
- allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID6);
- if (validate_convert_profile(&bctl->data, allowed)) {
- int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+ /*
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
+ * special bit for it, to make it easier to distinguish. Thus we need
+ * to set it manually, or balance would refuse the profile.
+ */
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
+ if (num_devices >= btrfs_raid_array[i].devs_min)
+ allowed |= btrfs_raid_array[i].bg_flag;
+ if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info,
"balance: invalid convert data profile %s",
- get_raid_name(index));
+ btrfs_bg_type_to_raid_name(bctl->data.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
- int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
-
btrfs_err(fs_info,
"balance: invalid convert metadata profile %s",
- get_raid_name(index));
+ btrfs_bg_type_to_raid_name(bctl->meta.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
- int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
-
btrfs_err(fs_info,
"balance: invalid convert system profile %s",
- get_raid_name(index));
+ btrfs_bg_type_to_raid_name(bctl->sys.target));
ret = -EINVAL;
goto out;
}
- /* allow to reduce meta or sys integrity only if force set */
- allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6;
+ /*
+ * Allow to reduce metadata or system integrity only if force set for
+ * profiles with redundancy (copies, parity)
+ */
+ allowed = 0;
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
+ if (btrfs_raid_array[i].ncopies >= 2 ||
+ btrfs_raid_array[i].tolerated_failures >= 1)
+ allowed |= btrfs_raid_array[i].bg_flag;
+ }
do {
seq = read_seqbegin(&fs_info->profiles_lock);
@@ -4170,9 +3938,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed)))
- reducing_integrity = true;
+ reducing_redundancy = true;
else
- reducing_integrity = false;
+ reducing_redundancy = false;
/* if we're not converting, the target field is uninitialized */
meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
@@ -4181,13 +3949,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (reducing_integrity) {
+ if (reducing_redundancy) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
- "balance: force reducing metadata integrity");
+ "balance: force reducing metadata redundancy");
} else {
btrfs_err(fs_info,
- "balance: reduces metadata integrity, use --force if you want this");
+ "balance: reduces metadata redundancy, use --force if you want this");
ret = -EINVAL;
goto out;
}
@@ -4195,12 +3963,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
- int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
- int data_index = btrfs_bg_flags_to_raid_index(data_target);
-
btrfs_warn(fs_info,
"balance: metadata profile %s has lower redundancy than data profile %s",
- get_raid_name(meta_index), get_raid_name(data_index));
+ btrfs_bg_type_to_raid_name(meta_target),
+ btrfs_bg_type_to_raid_name(data_target));
+ }
+
+ if (fs_info->send_in_progress) {
+ btrfs_warn_rl(fs_info,
+"cannot run balance while send operations are in progress (%d in progress)",
+ fs_info->send_in_progress);
+ ret = -EAGAIN;
+ goto out;
}
ret = insert_balance_item(fs_info, bctl);
@@ -4661,8 +4435,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
if (IS_ERR(trans))
return PTR_ERR(trans);
- uuid_root = btrfs_create_tree(trans, fs_info,
- BTRFS_UUID_TREE_OBJECTID);
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
btrfs_abort_transaction(trans, ret);
@@ -4722,15 +4495,16 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
int slot;
int failed = 0;
bool retried = false;
- bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
+ u64 start;
new_size = round_down(new_size, fs_info->sectorsize);
+ start = new_size;
diff = round_down(old_size - new_size, fs_info->sectorsize);
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
@@ -4742,6 +4516,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
path->reada = READA_BACK;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
@@ -4749,7 +4529,21 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
- mutex_unlock(&fs_info->chunk_mutex);
+
+ /*
+ * Once the device's size has been set to the new size, ensure all
+ * in-memory chunks are synced to disk so that the loop below sees them
+ * and relocates them accordingly.
+ */
+ if (contains_pending_extent(device, &start, diff)) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ goto done;
+ } else {
+ mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_end_transaction(trans);
+ }
again:
key.objectid = device->devid;
@@ -4840,40 +4634,10 @@ again:
}
mutex_lock(&fs_info->chunk_mutex);
-
- /*
- * We checked in the above loop all device extents that were already in
- * the device tree. However before we have updated the device's
- * total_bytes to the new size, we might have had chunk allocations that
- * have not complete yet (new block groups attached to transaction
- * handles), and therefore their device extents were not yet in the
- * device tree and we missed them in the loop above. So if we have any
- * pending chunk using a device extent that overlaps the device range
- * that we can not use anymore, commit the current transaction and
- * repeat the search on the device tree - this way we guarantee we will
- * not have chunks using device extents that end beyond 'new_size'.
- */
- if (!checked_pending_chunks) {
- u64 start = new_size;
- u64 len = old_size - new_size;
-
- if (contains_pending_extent(trans->transaction, device,
- &start, len)) {
- mutex_unlock(&fs_info->chunk_mutex);
- checked_pending_chunks = true;
- failed = 0;
- retried = false;
- ret = btrfs_commit_transaction(trans);
- if (ret)
- goto done;
- goto again;
- }
- }
-
btrfs_device_set_disk_total_bytes(device, new_size);
- if (list_empty(&device->resized_list))
- list_add_tail(&device->resized_list,
- &fs_info->fs_devices->resized_devices);
+ if (list_empty(&device->post_commit_list))
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy,
@@ -4957,14 +4721,13 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- - sizeof(struct btrfs_chunk)) \
- / sizeof(struct btrfs_stripe) + 1)
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
+ return;
-#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
- - 2 * sizeof(struct btrfs_disk_key) \
- - 2 * sizeof(struct btrfs_chunk)) \
- / sizeof(struct btrfs_stripe) + 1)
+ btrfs_set_fs_incompat(info, RAID1C34);
+}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 start, u64 type)
@@ -5011,6 +4774,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
devs_max = btrfs_raid_array[index].devs_max;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info);
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
@@ -5019,8 +4784,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -5028,17 +4791,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
+ devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
- BUG_ON(1);
+ BUG();
}
/* We don't want a chunk larger than 10% of writable space */
@@ -5079,7 +4839,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(trans, device,
+ ret = find_free_dev_extent(device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -5115,8 +4875,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
- /* round down to number of usable stripes */
- ndevs = round_down(ndevs, devs_increment);
+ /*
+ * Round down to number of usable stripes, devs_increment can be any
+ * number so we can't use round_down()
+ */
+ ndevs -= ndevs % devs_increment;
if (ndevs < devs_min) {
ret = -ENOSPC;
@@ -5205,7 +4968,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->block_len = em->len;
em->orig_block_len = stripe_size;
- em_tree = &info->mapping_tree.map_tree;
+ em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
if (ret) {
@@ -5213,23 +4976,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
goto error;
}
-
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
- refcount_inc(&em->refs);
write_unlock(&em_tree->lock);
ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
if (ret)
goto error_del_extent;
- for (i = 0; i < map->num_stripes; i++)
- btrfs_device_set_bytes_used(map->stripes[i].dev,
- map->stripes[i].dev->bytes_used + stripe_size);
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = map->stripes[i].dev;
+
+ btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
+ if (list_empty(&dev->post_commit_list))
+ list_add_tail(&dev->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
kfree(devices_info);
return 0;
@@ -5243,8 +5009,6 @@ error_del_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
- /* One for the pending_chunks list reference */
- free_extent_map(em);
error:
kfree(devices_info);
return ret;
@@ -5364,9 +5128,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
return __btrfs_alloc_chunk(trans, chunk_offset, type);
}
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
@@ -5386,20 +5150,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
- int max_errors;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
- max_errors = 2;
- } else {
- max_errors = 0;
- }
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
- return max_errors;
+ return btrfs_raid_array[index].tolerated_failures;
}
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
@@ -5440,21 +5193,16 @@ end:
return readonly;
}
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
-{
- extent_map_tree_init(&tree->map_tree);
-}
-
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
struct extent_map *em;
while (1) {
- write_lock(&tree->map_tree.lock);
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
if (em)
- remove_extent_mapping(&tree->map_tree, em);
- write_unlock(&tree->map_tree.lock);
+ remove_extent_mapping(tree, em);
+ write_unlock(&tree->lock);
if (!em)
break;
/* once for us */
@@ -5481,7 +5229,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
@@ -5555,7 +5303,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev;
ASSERT((map->type &
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
@@ -5666,12 +5414,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
* replace.
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length,
+ u64 logical, u64 *length_ret,
struct btrfs_bio **bbio_ret)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_bio *bbio;
+ u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
u64 stripe_nr_end;
@@ -5704,7 +5453,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
offset = logical - em->start;
- length = min_t(u64, em->len - offset, length);
+ length = min_t(u64, em->start + em->len - logical, length);
+ *length_ret = length;
stripe_len = map->stripe_len;
/*
@@ -5744,7 +5494,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&remaining_stripes);
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
last_stripe *= sub_stripes;
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
num_stripes = map->num_stripes;
} else {
@@ -5988,6 +5738,106 @@ static bool need_full_stripe(enum btrfs_map_op op)
return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
+/*
+ * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
+ * tuple. This information is used to calculate how big a
+ * particular bio can get before it straddles a stripe.
+ *
+ * @fs_info - the filesystem
+ * @logical - address that we want to figure out the geometry of
+ * @len - the length of IO we are going to perform, starting at @logical
+ * @op - type of operation - write or read
+ * @io_geom - pointer used to return values
+ *
+ * Returns < 0 in case a chunk for the given logical address cannot be found,
+ * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
+ */
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 offset;
+ u64 stripe_offset;
+ u64 stripe_nr;
+ u64 stripe_len;
+ u64 raid56_full_stripe_start = (u64)-1;
+ int data_stripes;
+ int ret = 0;
+
+ ASSERT(op != BTRFS_MAP_DISCARD);
+
+ em = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* Offset of this logical address in the chunk */
+ offset = logical - em->start;
+ /* Len of a stripe in a chunk */
+ stripe_len = map->stripe_len;
+ /* Stripe wher this block falls in */
+ stripe_nr = div64_u64(offset, stripe_len);
+ /* Offset of stripe in the chunk */
+ stripe_offset = stripe_nr * stripe_len;
+ if (offset < stripe_offset) {
+ btrfs_crit(fs_info,
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
+ stripe_offset, offset, em->start, logical, stripe_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_offset;
+ data_stripes = nr_data_stripes(map);
+
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ u64 max_len = stripe_len - stripe_offset;
+
+ /*
+ * In case of raid56, we need to know the stripe aligned start
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ unsigned long full_stripe_len = stripe_len * data_stripes;
+ raid56_full_stripe_start = offset;
+
+ /*
+ * Allow a write of a full stripe, but make sure we
+ * don't allow straddling of stripes
+ */
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+ full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+
+ /*
+ * For writes to RAID[56], allow a full stripeset across
+ * all disks. For other RAID types and for RAID[56]
+ * reads, just allow a single stripe (on a single disk).
+ */
+ if (op == BTRFS_MAP_WRITE) {
+ max_len = stripe_len * data_stripes -
+ (offset - raid56_full_stripe_start);
+ }
+ }
+ len = min_t(u64, em->len - offset, max_len);
+ } else {
+ len = em->len - offset;
+ }
+
+ io_geom->len = len;
+ io_geom->offset = offset;
+ io_geom->stripe_len = stripe_len;
+ io_geom->stripe_nr = stripe_nr;
+ io_geom->stripe_offset = stripe_offset;
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
+
+out:
+ /* once for us */
+ free_extent_map(em);
+ return ret;
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@@ -5996,11 +5846,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- u64 offset;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u32 stripe_index;
+ int data_stripes;
int i;
int ret = 0;
int num_stripes;
@@ -6013,76 +5863,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
+ struct btrfs_io_geometry geom;
+
+ ASSERT(bbio_ret);
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- *length, bbio_ret);
+ length, bbio_ret);
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+ if (ret < 0)
+ return ret;
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
map = em->map_lookup;
- offset = logical - em->start;
-
- stripe_len = map->stripe_len;
- stripe_nr = offset;
- /*
- * stripe_nr counts the total number of stripes we have to stride
- * to get to this block
- */
- stripe_nr = div64_u64(stripe_nr, stripe_len);
-
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
- "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
- stripe_offset, offset, em->start, logical,
- stripe_len);
- free_extent_map(em);
- return -EINVAL;
- }
-
- /* stripe_offset is the offset of this block in its stripe*/
- stripe_offset = offset - stripe_offset;
-
- /* if we're here for raid56, we need to know the stripe aligned start */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
- raid56_full_stripe_start = offset;
-
- /* allow a write of a full stripe, but make sure we don't
- * allow straddling of stripes
- */
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
- full_stripe_len);
- raid56_full_stripe_start *= full_stripe_len;
- }
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- u64 max_len;
- /* For writes to RAID[56], allow a full stripeset across all disks.
- For other RAID types and for RAID[56] reads, just allow a single
- stripe (on a single disk). */
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- (op == BTRFS_MAP_WRITE)) {
- max_len = stripe_len * nr_data_stripes(map) -
- (offset - raid56_full_stripe_start);
- } else {
- /* we limit the length of each bio to what fits in a stripe */
- max_len = stripe_len - stripe_offset;
- }
- *length = min_t(u64, em->len - offset, max_len);
- } else {
- *length = em->len - offset;
- }
-
- /*
- * This is for when we're called from btrfs_bio_fits_in_stripe and all
- * it cares about is the length
- */
- if (!bbio_ret)
- goto out;
+ *length = geom.len;
+ stripe_len = geom.stripe_len;
+ stripe_nr = geom.stripe_nr;
+ stripe_offset = geom.stripe_offset;
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
+ data_stripes = nr_data_stripes(map);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6114,7 +5916,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
&stripe_index);
if (!need_full_stripe(op))
mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (need_full_stripe(op))
num_stripes = map->num_stripes;
else if (mirror_num)
@@ -6156,7 +5958,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
- stripe_len * nr_data_stripes(map));
+ stripe_len * data_stripes);
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@@ -6172,10 +5974,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* Mirror #3 is RAID6 Q block.
*/
stripe_nr = div_u64_rem(stripe_nr,
- nr_data_stripes(map), &stripe_index);
+ data_stripes, &stripe_index);
if (mirror_num > 1)
- stripe_index = nr_data_stripes(map) +
- mirror_num - 2;
+ stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
@@ -6233,8 +6034,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
- tmp = stripe_nr * nr_data_stripes(map);
- for (i = 0; i < nr_data_stripes(map); i++)
+ tmp = stripe_nr * data_stripes;
+ for (i = 0; i < data_stripes; i++)
bbio->raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
@@ -6448,52 +6249,8 @@ static void btrfs_end_bio(struct bio *bio)
}
}
-/*
- * see run_scheduled_bios for a description of why bios are collected for
- * async submit.
- *
- * This will add one bio to the pending list for a device and make sure
- * the work struct is scheduled.
- */
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
- struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- int should_queue = 1;
- struct btrfs_pending_bios *pending_bios;
-
- /* don't bother with additional async steps for reads, right now */
- if (bio_op(bio) == REQ_OP_READ) {
- btrfsic_submit_bio(bio);
- return;
- }
-
- WARN_ON(bio->bi_next);
- bio->bi_next = NULL;
-
- spin_lock(&device->io_lock);
- if (op_is_sync(bio->bi_opf))
- pending_bios = &device->pending_sync_bios;
- else
- pending_bios = &device->pending_bios;
-
- if (pending_bios->tail)
- pending_bios->tail->bi_next = bio;
-
- pending_bios->tail = bio;
- if (!pending_bios->head)
- pending_bios->head = bio;
- if (device->running_pending)
- should_queue = 0;
-
- spin_unlock(&device->io_lock);
-
- if (should_queue)
- btrfs_queue_work(fs_info->submit_workers, &device->work);
-}
-
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr, int async)
+ u64 physical, int dev_nr)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
@@ -6511,10 +6268,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
- if (async)
- btrfs_schedule_bio(dev, bio);
- else
- btrfsic_submit_bio(bio);
+ btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
@@ -6535,7 +6289,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
}
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit)
+ int mirror_num)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
@@ -6604,7 +6358,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr, async_submit);
+ dev_nr);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
@@ -6708,105 +6462,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, btrfs_submit_helper,
- pending_bios_fn, NULL, NULL);
-
return dev;
}
-/* Return -EIO if any error, otherwise return 0. */
-static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 logical)
-{
- u64 length;
- u64 stripe_len;
- u16 num_stripes;
- u16 sub_stripes;
- u64 type;
- u64 features;
- bool mixed = false;
-
- length = btrfs_chunk_length(leaf, chunk);
- stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
- type = btrfs_chunk_type(leaf, chunk);
-
- if (!num_stripes) {
- btrfs_err(fs_info, "invalid chunk num_stripes: %u",
- num_stripes);
- return -EIO;
- }
- if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
- btrfs_err(fs_info, "invalid chunk logical %llu", logical);
- return -EIO;
- }
- if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
- btrfs_err(fs_info, "invalid chunk sectorsize %u",
- btrfs_chunk_sector_size(leaf, chunk));
- return -EIO;
- }
- if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
- btrfs_err(fs_info, "invalid chunk length %llu", length);
- return -EIO;
- }
- if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
- btrfs_err(fs_info, "invalid chunk stripe length: %llu",
- stripe_len);
- return -EIO;
- }
- if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- type) {
- btrfs_err(fs_info, "unrecognized chunk type: %llu",
- ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- btrfs_chunk_type(leaf, chunk));
- return -EIO;
- }
-
- if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
- btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
- return -EIO;
- }
-
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
- (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
- btrfs_err(fs_info,
- "system chunk with data or metadata type: 0x%llx", type);
- return -EIO;
- }
-
- features = btrfs_super_incompat_flags(fs_info->super_copy);
- if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
- mixed = true;
-
- if (!mixed) {
- if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
- (type & BTRFS_BLOCK_GROUP_DATA)) {
- btrfs_err(fs_info,
- "mixed chunk type in non-mixed mode: 0x%llx", type);
- return -EIO;
- }
- }
-
- if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
- num_stripes != 1)) {
- btrfs_err(fs_info,
- "invalid num_stripes:sub_stripes %u:%u for profile %llu",
- num_stripes, sub_stripes,
- type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
- return -EIO;
- }
-
- return 0;
-}
-
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
u64 devid, u8 *uuid, bool error)
{
@@ -6818,11 +6476,31 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
- struct extent_buffer *leaf,
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+ int index = btrfs_bg_flags_to_raid_index(type);
+ int ncopies = btrfs_raid_array[index].ncopies;
+ int data_stripes;
+
+ switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID5:
+ data_stripes = num_stripes - 1;
+ break;
+ case BTRFS_BLOCK_GROUP_RAID6:
+ data_stripes = num_stripes - 2;
+ break;
+ default:
+ data_stripes = num_stripes / ncopies;
+ break;
+ }
+ return div_u64(chunk_len, data_stripes);
+}
+
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
@@ -6837,13 +6515,19 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
length = btrfs_chunk_length(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
- if (ret)
- return ret;
+ /*
+ * Only need to verify chunk item if we're reading from sys chunk array,
+ * as chunk item in tree block is already verified by tree-checker.
+ */
+ if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
+ if (ret)
+ return ret;
+ }
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
- read_unlock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, logical, 1);
+ read_unlock(&map_tree->lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
@@ -6877,6 +6561,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
+ em->orig_block_len = calc_stripe_length(map->type, em->len,
+ map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6910,9 +6596,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
}
- write_lock(&map_tree->map_tree.lock);
- ret = add_extent_mapping(&map_tree->map_tree, em, 0);
- write_unlock(&map_tree->map_tree.lock);
+ write_lock(&map_tree->lock);
+ ret = add_extent_mapping(map_tree, em, 0);
+ write_unlock(&map_tree->lock);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
@@ -6972,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
fs_devices->opened = 1;
return fs_devices;
}
@@ -7001,10 +6687,10 @@ out:
return fs_devices;
}
-static int read_one_dev(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
+static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
@@ -7161,48 +6847,49 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be
- * present, exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ btrfs_err(fs_info,
+ "unexpected item type %u in sys_array at offset %u",
+ (u32)key.type, cur_offset);
+ ret = -EIO;
+ break;
+ }
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be present,
+ * exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ btrfs_err(fs_info,
+ "invalid number of stripes %u in sys_array at offset %u",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
- ret = read_one_chunk(fs_info, &key, sb, chunk);
- if (ret)
- break;
- } else {
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
- "unexpected item type %u in sys_array at offset %u",
- (u32)key.type, cur_offset);
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
ret = -EIO;
break;
}
+
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ ret = read_one_chunk(&key, sb, chunk);
+ if (ret)
+ break;
+
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
@@ -7230,14 +6917,14 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
u64 next_start = 0;
bool ret = true;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
- read_unlock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
+ read_unlock(&map_tree->lock);
/* No chunk at all? Return false anyway */
if (!em) {
ret = false;
@@ -7275,10 +6962,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
next_start = extent_map_end(em);
free_extent_map(em);
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, next_start,
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, next_start,
(u64)(-1) - next_start);
- read_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->lock);
}
out:
return ret;
@@ -7334,14 +7021,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
- ret = read_one_dev(fs_info, leaf, dev_item);
+ ret = read_one_dev(leaf, dev_item);
if (ret)
goto error;
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
+ ret = read_one_chunk(&found_key, leaf, chunk);
if (ret)
goto error;
}
@@ -7393,18 +7080,32 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
}
}
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+ const struct btrfs_dev_stats_item *ptr,
+ int index)
{
- int i;
+ u64 val;
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_reset(dev, i);
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
}
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
- struct btrfs_key found_key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct extent_buffer *eb;
@@ -7415,10 +7116,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
int i;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
@@ -7430,14 +7129,14 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
key.offset = device->devid;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
- __btrfs_reset_dev_stats(device);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
device->dev_stats_valid = 1;
btrfs_release_path(path);
continue;
}
slot = path->slots[0];
eb = path->nodes[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot,
@@ -7448,7 +7147,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
btrfs_dev_stat_set(device, i,
btrfs_dev_stats_value(eb, ptr, i));
else
- btrfs_dev_stat_reset(device, i);
+ btrfs_dev_stat_set(device, i, 0);
}
device->dev_stats_valid = 1;
@@ -7457,7 +7156,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
-out:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@@ -7530,9 +7228,9 @@ out:
/*
* called from commit_transaction. Writes all changed device stats to disk.
*/
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int stats_cnt;
@@ -7631,7 +7329,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
stats->values[i] =
btrfs_dev_stat_read_and_reset(dev, i);
else
- btrfs_dev_stat_reset(dev, i);
+ btrfs_dev_stat_set(dev, i, 0);
}
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
@@ -7674,51 +7372,34 @@ void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_pat
}
/*
- * Update the size of all devices, which is used for writing out the
- * super blocks.
+ * Update the size and bytes used for each device where it changed. This is
+ * delayed since we would otherwise get errors while writing out the
+ * superblocks.
+ *
+ * Must be invoked during transaction commit.
*/
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *curr, *next;
- if (list_empty(&fs_devices->resized_devices))
- return;
-
- mutex_lock(&fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
- resized_list) {
- list_del_init(&curr->resized_list);
- curr->commit_total_bytes = curr->disk_total_bytes;
- }
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_devices->device_list_mutex);
-}
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
-/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
- struct btrfs_device *dev;
- int i;
-
- if (list_empty(&trans->pending_chunks))
+ if (list_empty(&trans->dev_update_list))
return;
- /* In order to kick the device replace finish process */
- mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry(em, &trans->pending_chunks, list) {
- map = em->map_lookup;
-
- for (i = 0; i < map->num_stripes; i++) {
- dev = map->stripes[i].dev;
- dev->commit_bytes_used = dev->bytes_used;
- }
+ /*
+ * We don't need the device_list_mutex here. This list is owned by the
+ * transaction and the transaction must complete before the device is
+ * released.
+ */
+ mutex_lock(&trans->fs_info->chunk_mutex);
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
+ post_commit_list) {
+ list_del_init(&curr->post_commit_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ curr->commit_bytes_used = curr->bytes_used;
}
- mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&trans->fs_info->chunk_mutex);
}
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
@@ -7744,38 +7425,18 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
*/
int btrfs_bg_type_to_factor(u64 flags)
{
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- return 2;
- return 1;
+ const int index = btrfs_bg_flags_to_raid_index(flags);
+
+ return btrfs_raid_array[index].ncopies;
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
-{
- int index = btrfs_bg_flags_to_raid_index(type);
- int ncopies = btrfs_raid_array[index].ncopies;
- int data_stripes;
-
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- case BTRFS_BLOCK_GROUP_RAID5:
- data_stripes = num_stripes - 1;
- break;
- case BTRFS_BLOCK_GROUP_RAID6:
- data_stripes = num_stripes - 2;
- break;
- default:
- data_stripes = num_stripes / ncopies;
- break;
- }
- return div_u64(chunk_len, data_stripes);
-}
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
@@ -7864,7 +7525,7 @@ out:
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct rb_node *node;
int ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3ad9d58d1b66..fc1b564b9cfe 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -18,9 +18,20 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
-struct btrfs_pending_bios {
- struct bio *head;
- struct bio *tail;
+
+struct btrfs_io_geometry {
+ /* remaining bytes before crossing a stripe */
+ u64 len;
+ /* offset of logical address in chunk */
+ u64 offset;
+ /* length of single IO stripe */
+ u64 stripe_len;
+ /* number of stripe where address falls */
+ u64 stripe_nr;
+ /* offset of address in stripe */
+ u64 stripe_offset;
+ /* offset of raid56 stripe into the chunk */
+ u64 raid56_stripe_offset;
};
/*
@@ -43,8 +54,9 @@ struct btrfs_pending_bios {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
struct btrfs_device {
- struct list_head dev_list;
- struct list_head dev_alloc_list;
+ struct list_head dev_list; /* device_list_mutex */
+ struct list_head dev_alloc_list; /* chunk mutex */
+ struct list_head post_commit_list; /* chunk mutex */
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
@@ -52,13 +64,6 @@ struct btrfs_device {
u64 generation;
- spinlock_t io_lock ____cacheline_aligned;
- int running_pending;
- /* regular prio bios */
- struct btrfs_pending_bios pending_bios;
- /* sync bios */
- struct btrfs_pending_bios pending_sync_bios;
-
struct block_device *bdev;
/* the mode sent to blkdev_get */
@@ -66,7 +71,6 @@ struct btrfs_device {
unsigned long dev_state;
blk_status_t last_flush_error;
- int flush_bio_sent;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
seqcount_t data_seqcount;
@@ -102,18 +106,12 @@ struct btrfs_device {
* size of the device on the current transaction
*
* This variant is update when committing the transaction,
- * and protected by device_list_mutex
+ * and protected by chunk mutex
*/
u64 commit_total_bytes;
/* bytes used on the current transaction */
u64 commit_bytes_used;
- /*
- * used to manage the device which is resized
- *
- * It is protected by chunk_lock.
- */
- struct list_head resized_list;
/* for sending down flush barriers */
struct bio *flush_bio;
@@ -123,7 +121,6 @@ struct btrfs_device {
struct scrub_ctx *scrub_ctx;
struct btrfs_work work;
- struct rcu_head rcu;
/* readahead state */
atomic_t reada_in_flight;
@@ -139,6 +136,8 @@ struct btrfs_device {
/* Counter to record the change of device stats */
atomic_t dev_stats_ccnt;
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+
+ struct extent_io_tree alloc_state;
};
/*
@@ -233,21 +232,25 @@ struct btrfs_fs_devices {
* this mutex lock.
*/
struct mutex device_list_mutex;
+
+ /* List of all devices, protected by device_list_mutex */
struct list_head devices;
- struct list_head resized_devices;
- /* devices not currently being allocated */
+ /*
+ * Devices which can satisfy space allocation. Protected by
+ * chunk_mutex
+ */
struct list_head alloc_list;
struct btrfs_fs_devices *seed;
- int seeding;
+ bool seeding;
int opened;
/* set when we find or add a device that doesn't have the
* nonrot flag set
*/
- int rotating;
+ bool rotating;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -258,6 +261,15 @@ struct btrfs_fs_devices {
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
/*
* we need the mirror number and stripe index to be passed around
* the call chain while we are processing end_io (especially errors).
@@ -307,7 +319,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;
@@ -332,16 +343,16 @@ struct btrfs_device_info {
};
struct btrfs_raid_attr {
- int sub_stripes; /* sub_stripes info for map */
- int dev_stripes; /* stripes per dev */
- int devs_max; /* max devs to use */
- int devs_min; /* min devs needed */
- int tolerated_failures; /* max tolerated fail devs */
- int devs_increment; /* ndevs has to be a multiple of this */
- int ncopies; /* how many copies to data has */
- int nparity; /* number of stripes worth of bytes to store
+ u8 sub_stripes; /* sub_stripes info for map */
+ u8 dev_stripes; /* stripes per dev */
+ u8 devs_max; /* max devs to use */
+ u8 devs_min; /* min devs needed */
+ u8 tolerated_failures; /* max tolerated fail devs */
+ u8 devs_increment; /* ndevs has to be a multiple of this */
+ u8 ncopies; /* how many copies to data has */
+ u8 nparity; /* number of stripes worth of bytes to store
* parity information */
- int mindev_error; /* error code if min devs requisite is unmet */
+ u8 mindev_error; /* error code if min devs requisite is unmet */
const char raid_name[8]; /* name of the raid */
u64 bg_flag; /* block group flag of the raid */
};
@@ -390,6 +401,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
+ /* fall through */
case REQ_OP_READ:
return BTRFS_MAP_READ;
}
@@ -403,15 +415,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit);
+ int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -449,22 +462,16 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
- struct btrfs_device *device, u64 num_bytes,
- u64 search_start, u64 *start, u64 *max_avail);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
@@ -528,12 +535,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
-static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
- int index)
-{
- btrfs_dev_stat_set(dev, index, 0);
-}
-
/*
* Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
* can be used as index to access btrfs_raid_array[].
@@ -544,6 +545,10 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
@@ -556,18 +561,16 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
-const char *get_raid_name(enum btrfs_raid_types type);
-
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
-struct list_head *btrfs_get_fs_uuids(void);
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
int btrfs_bg_type_to_factor(u64 flags);
+const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f141b45ce349..95d9aebff2c4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -76,9 +76,8 @@ out:
return ret;
}
-static int do_setxattr(struct btrfs_trans_handle *trans,
- struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags)
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -87,6 +86,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
size_t name_len = strlen(name);
int ret = 0;
+ ASSERT(trans);
+
if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
return -ENOSPC;
@@ -174,7 +175,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
char *ptr;
if (size > old_data_len) {
- if (btrfs_leaf_free_space(fs_info, leaf) <
+ if (btrfs_leaf_free_space(leaf) <
(size - old_data_len)) {
ret = -ENOSPC;
goto out;
@@ -184,17 +185,15 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
- btrfs_extend_item(fs_info, path,
- size - old_data_len);
+ btrfs_extend_item(path, size - old_data_len);
else if (size < old_data_len)
- btrfs_truncate_item(fs_info, path,
- data_size, 1);
+ btrfs_truncate_item(path, data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
- btrfs_extend_item(fs_info, path, data_size);
+ btrfs_extend_item(path, data_size);
}
item = btrfs_item_nr(slot);
@@ -214,36 +213,32 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
}
out:
btrfs_free_path(path);
+ if (!ret)
+ set_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
return ret;
}
/*
* @value: "" makes the attribute to empty, NULL removes it
*/
-int btrfs_setxattr(struct btrfs_trans_handle *trans,
- struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (trans)
- return do_setxattr(trans, inode, name, value, size, flags);
-
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = do_setxattr(trans, inode, name, value, size, flags);
+ ret = btrfs_setxattr(trans, inode, name, value, size, flags);
if (ret)
goto out;
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
@@ -370,7 +365,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
- return btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
@@ -378,8 +373,30 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
const char *name, const void *value,
size_t size, int flags)
{
+ int ret;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
name = xattr_full_name(handler, name);
- return btrfs_set_prop(inode, name, value, size, flags);
+ ret = btrfs_validate_prop(name, value, size);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+ if (!ret) {
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ }
+
+ btrfs_end_transaction(trans);
+
+ return ret;
}
static const struct xattr_handler btrfs_security_xattr_handler = {
@@ -419,10 +436,10 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
};
static int btrfs_initxattrs(struct inode *inode,
- const struct xattr *xattr_array, void *fs_info)
+ const struct xattr *xattr_array, void *fs_private)
{
+ struct btrfs_trans_handle *trans = fs_private;
const struct xattr *xattr;
- struct btrfs_trans_handle *trans = fs_info;
unsigned int nofs_flag;
char *name;
int err = 0;
@@ -442,7 +459,7 @@ static int btrfs_initxattrs(struct inode *inode,
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
err = btrfs_setxattr(trans, inode, name, xattr->value,
- xattr->value_len, 0);
+ xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 471fcac6ff55..1cd3fc0a8f17 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -12,9 +12,10 @@ extern const struct xattr_handler *btrfs_xattr_handlers[];
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
-int btrfs_setxattr(struct btrfs_trans_handle *trans,
- struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags);
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b86b7ad6b900..a6c90a003c12 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -29,19 +29,9 @@ struct workspace {
static struct workspace_manager wsm;
-static void zlib_init_workspace_manager(void)
+struct list_head *zlib_get_workspace(unsigned int level)
{
- btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
-}
-
-static void zlib_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *zlib_get_workspace(unsigned int level)
-{
- struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -49,12 +39,7 @@ static struct list_head *zlib_get_workspace(unsigned int level)
return ws;
}
-static void zlib_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void zlib_free_workspace(struct list_head *ws)
+void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -63,7 +48,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(unsigned int level)
+struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -88,13 +73,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-static int zlib_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
@@ -228,7 +209,7 @@ out:
return ret;
}
-static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -319,10 +300,9 @@ done:
return ret;
}
-static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -418,23 +398,8 @@ next:
return ret;
}
-static unsigned int zlib_set_level(unsigned int level)
-{
- if (!level)
- return BTRFS_ZLIB_DEFAULT_LEVEL;
-
- return min_t(unsigned int, level, 9);
-}
-
const struct btrfs_compress_op btrfs_zlib_compress = {
- .init_workspace_manager = zlib_init_workspace_manager,
- .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
- .get_workspace = zlib_get_workspace,
- .put_workspace = zlib_put_workspace,
- .alloc_workspace = zlib_alloc_workspace,
- .free_workspace = zlib_free_workspace,
- .compress_pages = zlib_compress_pages,
- .decompress_bio = zlib_decompress_bio,
- .decompress = zlib_decompress,
- .set_level = zlib_set_level,
+ .workspace_manager = &wsm,
+ .max_level = 9,
+ .default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 6b9e29d050f3..9a4871636c6c 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -17,6 +17,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
+#include "misc.h"
#include "compression.h"
#include "ctree.h"
@@ -90,6 +91,8 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_alloc_workspace(unsigned int level);
/*
* zstd_reclaim_timer_fn - reclaim timer
* @t: timer
@@ -102,10 +105,10 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- spin_lock(&wsm.lock);
+ spin_lock_bh(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
return;
}
@@ -124,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
level = victim->level;
list_del(&victim->lru_list);
list_del(&victim->list);
- wsm.ops->free_workspace(&victim->list);
+ zstd_free_workspace(&victim->list);
if (list_empty(&wsm.idle_ws[level - 1]))
clear_bit(level - 1, &wsm.active_map);
@@ -134,7 +137,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
}
/*
@@ -164,7 +167,7 @@ static void zstd_calc_ws_mem_sizes(void)
}
}
-static void zstd_init_workspace_manager(void)
+void zstd_init_workspace_manager(void)
{
struct list_head *ws;
int i;
@@ -180,7 +183,7 @@ static void zstd_init_workspace_manager(void)
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&wsm.idle_ws[i]);
- ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
pr_warn(
"BTRFS: cannot preallocate zstd compression workspace\n");
@@ -190,22 +193,22 @@ static void zstd_init_workspace_manager(void)
}
}
-static void zstd_cleanup_workspace_manager(void)
+void zstd_cleanup_workspace_manager(void)
{
struct workspace *workspace;
int i;
- spin_lock(&wsm.lock);
+ spin_lock_bh(&wsm.lock);
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
while (!list_empty(&wsm.idle_ws[i])) {
workspace = container_of(wsm.idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
- wsm.ops->free_workspace(&workspace->list);
+ zstd_free_workspace(&workspace->list);
}
}
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
del_timer_sync(&wsm.timer);
}
@@ -227,7 +230,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
struct workspace *workspace;
int i = level - 1;
- spin_lock(&wsm.lock);
+ spin_lock_bh(&wsm.lock);
for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
if (!list_empty(&wsm.idle_ws[i])) {
ws = wsm.idle_ws[i].next;
@@ -239,11 +242,11 @@ static struct list_head *zstd_find_workspace(unsigned int level)
list_del(&workspace->lru_list);
if (list_empty(&wsm.idle_ws[i]))
clear_bit(i, &wsm.active_map);
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
return ws;
}
}
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
return NULL;
}
@@ -257,7 +260,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-static struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(unsigned int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -272,7 +275,7 @@ again:
return ws;
nofs_flag = memalloc_nofs_save();
- ws = wsm.ops->alloc_workspace(level);
+ ws = zstd_alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
@@ -298,11 +301,11 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-static void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct list_head *ws)
{
struct workspace *workspace = list_to_workspace(ws);
- spin_lock(&wsm.lock);
+ spin_lock_bh(&wsm.lock);
/* A node is only taken off the lru if we are the corresponding level */
if (workspace->req_level == workspace->level) {
@@ -322,13 +325,13 @@ static void zstd_put_workspace(struct list_head *ws)
list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
workspace->req_level = 0;
- spin_unlock(&wsm.lock);
+ spin_unlock_bh(&wsm.lock);
if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
cond_wake_up(&wsm.wait);
}
-static void zstd_free_workspace(struct list_head *ws)
+void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -337,7 +340,7 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -363,13 +366,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-static int zstd_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_CStream *stream;
@@ -544,7 +543,7 @@ out:
return ret;
}
-static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
@@ -622,10 +621,9 @@ done:
return ret;
}
-static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_DStream *stream;
@@ -707,23 +705,9 @@ finish:
return ret;
}
-static unsigned int zstd_set_level(unsigned int level)
-{
- if (!level)
- return ZSTD_BTRFS_DEFAULT_LEVEL;
-
- return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
-}
-
const struct btrfs_compress_op btrfs_zstd_compress = {
- .init_workspace_manager = zstd_init_workspace_manager,
- .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
- .get_workspace = zstd_get_workspace,
- .put_workspace = zstd_put_workspace,
- .alloc_workspace = zstd_alloc_workspace,
- .free_workspace = zstd_free_workspace,
- .compress_pages = zstd_compress_pages,
- .decompress_bio = zstd_decompress_bio,
- .decompress = zstd_decompress,
- .set_level = zstd_set_level,
+ /* ZSTD uses own workspace manager */
+ .workspace_manager = NULL,
+ .max_level = ZSTD_BTRFS_MAX_LEVEL,
+ .default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};